// Reposted from http://jlm0808.blogcn.com/diary,113856063.shtml
package com.sample;
/**
*@author Jerry Chiang
*@version 1.0
*/
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
/**
 * Parses an HTML page with the CyberNeko {@code DOMParser}, prints every non-blank
 * node value found in the resulting DOM tree, and reports how many characters of
 * text-node content were collected.
 */
public class TestHTMLDOM {
    /** Location handed to the parser when the program runs. */
    private static final String TEST_URL = "http://www.baidu.com";

    /**
     * Collects the trimmed content of every text node visited by {@link #print}.
     * A builder that starts empty is used on purpose: appending to an uninitialised
     * {@code String} field would prepend the literal "null" to the collected text
     * (inflating the count by four) and would make {@code length()} throw when the
     * page contains no text at all.
     */
    private static final StringBuilder PAGE_TEXT = new StringBuilder();

    /**
     * Parses {@link #TEST_URL}, dumps the DOM tree to standard output and then prints
     * the total number of characters gathered from text nodes.
     *
     * @param argv command-line arguments (unused)
     * @throws Exception if the parser fails to load or parse the document
     */
    public static void main(String[] argv) throws Exception {
        // Start from a clean slate so a repeated invocation does not add to an old count.
        PAGE_TEXT.setLength(0);

        DOMParser parser = new DOMParser();
        parser.parse(TEST_URL);
        print(parser.getDocument(), "");

        System.out.println("该网页中总字数为:" + PAGE_TEXT.length());
    }

    /**
     * Walks the subtree rooted at {@code node} depth-first. For every node whose value
     * is non-null and not blank after trimming, prints {@code indent}, the trimmed value
     * and the node name on one line; values of text nodes are additionally appended to
     * {@link #PAGE_TEXT}.
     *
     * @param node   root of the subtree to print
     * @param indent prefix written before this node's line; each child level gets one
     *               extra space
     */
    private static void print(Node node, String indent) {
        String value = node.getNodeValue();
        if (value != null) {
            String trimmed = value.trim();
            if (!trimmed.isEmpty()) {
                System.out.println(indent + trimmed + node.getNodeName());
                // Only text nodes contribute to the character count.
                if (node.getNodeType() == Node.TEXT_NODE) {
                    PAGE_TEXT.append(trimmed);
                }
            }
        }

        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            print(child, indent + " ");
        }
    }
}
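// Residue from the blog page this listing was copied from (not part of the program):
//   433
//   "... comments have been collapsed"
//   "Why were they collapsed?"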



