NekoHTML HTML扫描器和标签补偿器

最新推荐文章于 2025-12-01 22:18:36 发布

转载最新推荐文章于 2025-12-01 22:18:36 发布 · 1.3k 阅读

标签

#html #import #exception #string #url

本文介绍了一个使用NekoHTML库解析HTML文档的Java程序实例。该程序能够处理不规范的HTML文档，将其转换为标准的XML结构，并通过打印文档的不同部分展示了如何遍历和获取节点信息。

引用

NekoHTML 能解析HTML文档，并用标准的XML接口来访问其中的信息。它能增补缺失的父元素、自动用结束标签关闭相应的元素，并能处理不匹配的内嵌元素标签。

/*
* Created on 2005-6-7
*
* To change the template for this generated file go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/
package com.lsz;

import java.io.PrintWriter;

import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.html.HTMLDocument;

/**
* @author Administrator
*
* To change the template for this generated type comment go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/
public class NekoHtmlTest {
public static void main(String[] argv) throws Exception {
  NekoHtmlTest nh = new NekoHtmlTest();
  String url =
   "http://www.sentom.net/index.asp?page=2&classid=3&Nclassid=6";
  nh.printNodeValue(url);
  nh.parsePage(url);
  nh.fragment(url);
}
//打印每个节点的值
public void printNodeValue(String url) throws Exception {
  DOMParser parser = new DOMParser();
  parser.parse(url);
  print(parser.getDocument(), "");

}

public void fragment(String url) throws Exception {
  DOMFragmentParser parser = new DOMFragmentParser();
  HTMLDocument document = new HTMLDocumentImpl();

  DocumentFragment fragment = document.createDocumentFragment();
  parser.parse(url, fragment);
  print(fragment, "");

}

public static void print(Node node, String indent) {
  if (node.getNodeValue() != null) {
   if ("".equals(node.getNodeValue().trim())) {
   } else {
    System.out.print(indent);
    System.out.println(node.getNodeValue());
   }
  }

  Node child = node.getFirstChild();
  while (child != null) {
   print(child, indent + " ");
   child = child.getNextSibling();
  }
}

//打印整个页面
public void parsePage(String url) throws Exception {
  DOMParser parser = new DOMParser();
  parser.parse(url);
  print(parser.getDocument());
}

//和Jtidy的一样，但中文不需要做特殊处理
protected PrintWriter out = new PrintWriter(System.out);
public void print(Node node) {

  if (node == null) {
   return;
  }

  int type = node.getNodeType();
  switch (type) {
   case Node.DOCUMENT_NODE :
    print(((Document) node).getDocumentElement());
    out.flush();
    break;

   case Node.ELEMENT_NODE :
    out.print('<');

    out.print(node.getNodeName());
    NamedNodeMap attrs = node.getAttributes();

    for (int i = 0; i < attrs.getLength(); i++) {
     out.print(' ');
     out.print(attrs.item(i).getNodeName());
     out.print("=/"");

     out.print(attrs.item(i).getNodeValue());
     out.print('"');
    }
    out.print('>');
    out.println(); // HACK
    NodeList children = node.getChildNodes();
    if (children != null) {
     int len = children.getLength();
     for (int i = 0; i < len; i++) {
      print(children.item(i));
     }
    }
    break;

   case Node.TEXT_NODE :
    out.print(node.getNodeValue());
    break;

  }

  if (type == Node.ELEMENT_NODE) {
   out.print("</");
   out.print(node.getNodeName());
   out.print('>');
   out.println();
  }

  out.flush();

}

}