下面提供一个 Java 文件,它实现的功能是:访问指定网址,并把该页面下载到指定目录;同时可以设置代理服务器。
package ie;
/**
* @author webkkk(blog.csdn.net/webkkk)
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Minimal "browser-like" page downloader: issues an HTTP GET through a fixed
 * HTTP proxy, presenting an Internet Explorer 6 User-Agent, and stores the
 * response body on disk.
 */
public class JavaIe {

    /** Proxy host installed into the JVM-wide {@code http.proxyHost} property. */
    private static final String PROXY_HOST = "172.16.64.10";

    /** Proxy port installed into the JVM-wide {@code http.proxyPort} property. */
    private static final String PROXY_PORT = "12080";

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)";

    /** File name used when the output path is a directory and the URL path has no last segment. */
    private static final String DEFAULT_FILE_NAME = "index.html";

    /** Size in bytes of the copy buffer. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Downloads a sample page into {@code C:/temp/log/}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        DownLoadPages("http://www.rdnovel.com/files/article/novelread/0/37/349424.html", "C:/temp/log/");
    }

    /**
     * Downloads the resource at {@code urlStr} and writes its body to {@code outPath}.
     *
     * <p>If {@code outPath} is an existing directory, or ends with a path separator,
     * the file is created inside it and named after the last segment of the URL path
     * ({@code index.html} when that segment is empty). Missing parent directories are
     * created.
     *
     * <p>Failures are not propagated: a malformed URL or any I/O error is printed to
     * standard error and the method returns normally. Note that this method sets the
     * JVM-wide proxy system properties, which affects other HTTP connections opened
     * by the same process.
     *
     * @param urlStr  absolute HTTP URL of the page to fetch
     * @param outPath target file, or target directory
     */
    public static void DownLoadPages(String urlStr, String outPath) {
        HttpURLConnection httpConn = null;
        InputStream in = null;
        FileOutputStream out = null;
        try {
            // use proxy begin
            // NOTE(review): "proxySet" is kept from the original code; the standard
            // networking properties are http.proxyHost / http.proxyPort - confirm
            // whether anything still reads "proxySet".
            System.setProperty("proxySet", "true");
            System.setProperty("http.proxyHost", PROXY_HOST);
            System.setProperty("http.proxyPort", PROXY_PORT);
            // use proxy end

            URL url = new URL(urlStr);
            httpConn = (HttpURLConnection) url.openConnection();
            // Per-connection setting; the static setFollowRedirects() only changes the
            // default for connections created afterwards.
            httpConn.setInstanceFollowRedirects(true);
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-Agent", USER_AGENT);

            // Open the response first so that a failed request leaves no file behind.
            in = httpConn.getInputStream();
            out = new FileOutputStream(resolveTarget(url, outPath));

            byte[] buffer = new byte[BUFFER_SIZE];
            int read = in.read(buffer);
            while (read != -1) {
                out.write(buffer, 0, read);
                read = in.read(buffer);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Each resource is released independently: any of them may still be null
            // if the failure happened early, and one failing close must not skip the rest.
            closeQuietly(out);
            closeQuietly(in);
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }
    }

    /**
     * Maps the caller-supplied output path to the concrete file to write,
     * creating missing parent directories.
     */
    private static File resolveTarget(URL url, String outPath) {
        File target = new File(outPath);
        boolean isDirectoryPath = target.isDirectory()
                || outPath.endsWith("/")
                || outPath.endsWith(File.separator);
        if (isDirectoryPath) {
            String urlPath = url.getPath();
            String fileName = urlPath.substring(urlPath.lastIndexOf('/') + 1);
            if (fileName.length() == 0) {
                fileName = DEFAULT_FILE_NAME;
            }
            target = new File(target, fileName);
        }
        File parent = target.getParentFile();
        if (parent != null && !parent.exists()) {
            // Result deliberately ignored: if creation fails, opening the file
            // reports the problem as an IOException.
            parent.mkdirs();
        }
        return target;
    }

    /** Closes {@code resource} if it is non-null, printing (not propagating) any close failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
此外,还有第二种方法可以访问 Google 的网站:使用 Apache 的 HttpClient 工具模拟浏览器来访问 Google。
// Fetches the page at `url` with Apache Commons HttpClient and parses it into a DOM.
// `url` comes from the enclosing method, which is not part of this snippet.
// `document` stays null when the server answers with anything other than 200 OK.
Document document = null;
HttpClient httpClient = new HttpClient();
GetMethod getMethod = new GetMethod(url);
getMethod.setFollowRedirects(true);
try
{
    int statusCode = httpClient.executeMethod(getMethod);
    if (statusCode == HttpStatus.SC_OK)
    {
        InputStream in = getMethod.getResponseBodyAsStream();
        InputSource is = new InputSource(in);
        DOMParser domParser = new DOMParser(); // NekoHTML: converts the fetched page into a DOM
        domParser.parse(is);
        document = domParser.getDocument();
        System.out.println(getMethod.getURI());
    }
}
finally
{
    // The response body is fully consumed by the parser above; always hand the
    // connection back, otherwise it is never returned to the connection manager.
    getMethod.releaseConnection();
}
return document;
推荐使用第一种方式:HttpURLConnection 比较轻量级,速度也比第二种方式(HttpClient)快。
关于用 Java 模拟 IE 提交表单(form)登录网站的问题:
// Submits a form with HTTP POST while presenting an IE 6 User-Agent, then captures the
// Set-Cookie response header and opens a reader on the response body.
// `url`, `cookie` and `data` come from the surrounding code, which is not part of this snippet.
HttpURLConnection urlConn=(HttpURLConnection)(new URL(url).openConnection());
urlConn.addRequestProperty("Cookie",cookie);
urlConn.setRequestMethod("POST");
urlConn.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)");
// setFollowRedirects() is static and only changes the default for connections created
// later; the per-connection switch is setInstanceFollowRedirects().
urlConn.setInstanceFollowRedirects(true);
urlConn.setDoOutput(true); // the request body is written to the server
urlConn.setDoInput(true); // the response is read back
urlConn.setUseCaches(false); // bypass caches to get the server's latest data
urlConn.setAllowUserInteraction(false);
urlConn.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
urlConn.setRequestProperty("Content-Language","en-US" );
// writeBytes() below emits exactly one byte per char (the low eight bits), so
// data.length() is the body size in bytes. `data` must therefore already be
// URL-encoded; raw non-ASCII characters would be corrupted.
urlConn.setRequestProperty("Content-Length", ""+data.length());
DataOutputStream outStream = new DataOutputStream(urlConn.getOutputStream());
try {
    outStream.writeBytes(data);
    outStream.flush();
} finally {
    // Close the request body even when the write fails.
    outStream.close();
}
// NOTE(review): getHeaderField returns a single value; if the server sends several
// Set-Cookie headers only one of them is kept here - confirm this is sufficient.
cookie=urlConn.getHeaderField("Set-Cookie");
BufferedReader br=new BufferedReader(new InputStreamReader(urlConn.getInputStream(),"gb2312"));
这篇博客介绍了如何使用Java模拟IE浏览器访问并下载网页内容。通过设置System.properties实现代理,利用HttpURLConnection打开连接,设置请求头并读取输入流进行数据写入。同时还提到了使用HttpClient作为另一种访问网站的方法,但推荐使用HttpURLConnection因其轻量级和速度优势。

2万+

被折叠的 条评论
为什么被折叠?



