NekoHTML

最新推荐文章于 2021-03-20 20:27:05 发布

转载最新推荐文章于 2021-03-20 20:27:05 发布 · 933 阅读

收录于

当前文章被以下社区和专栏收录：

转自http://jlm0808.blogcn.com/diary,113856063.shtml

package com.sample;

/**
*@author Jerry Chiang
*@version 1.0
*/

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

/**
 * Small demo that parses an HTML page with the NekoHTML {@code DOMParser},
 * prints the value of every node whose value is non-blank, and reports the
 * total number of characters collected from the text nodes.
 */
public class TestHTMLDOM {

  /** Address of the page handed to the parser. */
  private static final String TEST_URL = "http://www.baidu.com";

  /**
   * Accumulates the trimmed content of every non-blank text node visited by
   * {@link #print(Node, String)}. It starts out empty (rather than
   * {@code null}) so the reported length is not inflated by a leading
   * "null" and so a page without any text cannot cause a
   * {@code NullPointerException} in {@code main}.
   */
  private static final StringBuilder COLLECTED_TEXT = new StringBuilder();

  /**
   * Parses {@code TEST_URL}, dumps its DOM tree and prints the number of
   * characters gathered from the text nodes.
   *
   * @param argv command-line arguments; not used
   * @throws Exception if the parser fails to read or parse the document
   */
  public static void main(String[] argv) throws Exception {
    DOMParser parser = new DOMParser();     // create the parser
    parser.parse(TEST_URL);                 // parse the given HTML document
    print(parser.getDocument(), "");        // dump the DOM tree
    // Report how many characters the text nodes contributed.
    System.out.println("该网页中总字数为：" + COLLECTED_TEXT.length());
  }

  /**
   * Recursively walks the subtree rooted at {@code node} in document order.
   * For every node whose value is non-null and not blank after trimming, it
   * prints {@code indent}, the trimmed value and the node name on one line;
   * if that node is a text node, the trimmed value is also appended to the
   * collected text.
   *
   * @param node   root of the subtree to print
   * @param indent prefix printed before the node's line; one space is added
   *               for each level of depth
   */
  private static void print(Node node, String indent) {
    String value = node.getNodeValue();
    if (value != null) {
      String trimmed = value.trim();
      if (!trimmed.isEmpty()) {
        System.out.print(indent);
        // The node name is printed directly after the content, with no separator.
        System.out.println(trimmed + node.getNodeName());
        if (node.getNodeType() == Node.TEXT_NODE) {
          COLLECTED_TEXT.append(trimmed);
        }
      }
    }

    // Visit the children left to right, one indent level deeper.
    for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
      print(child, indent + " ");
    }
  }

}