最近学习了一下 BeautifulSoup4，便用 BS4 写了一个爬虫小程序：它可以从网页上下载图片并保存到本地，自动提取网页上的链接放入链接库以便继续爬取，同时记录所有已经爬取过的网页；对于已经下载过的图片，则不再重复保存。小程序目前还没有断点续爬功能，以后有时间再补上。
import os
import time
import re
import urllib3
import urllib
from bs4 import BeautifulSoup
from PIL import Image
import hashlib
# HTTP headers sent with every request; the User-Agent string presents the
# crawler as a desktop Chrome browser on macOS.
# NOTE(review): "Chrome/1 7.0.963.56" contains a space that looks like a typo
# for "Chrome/17.0.963.56" -- confirm before changing, it is sent verbatim.
headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/1 7.0.963.56 Safari/535.11"}
# Fill in the address of the web page the crawl should start from.
url = "http://www.xxxxxxxxxxxxxxxxx"
# Pages currently waiting to be crawled.
urllist = []
# Every page that has ever been queued; used to filter out links that were
# already seen.
urllistAll = []
# MD5 digests of the images handled so far; used to filter out images that
# were already downloaded.
md5_list = []
# Crawler class: downloads the images of each page and queues the links it finds.
class Spider:
    """Image crawler built on urllib3 and BeautifulSoup.

    The crawl state lives in the module-level lists ``urllist`` (pages waiting
    to be crawled), ``urllistAll`` (every page ever queued) and ``md5_list``
    (MD5 digests of the images already saved).  Every request is sent with the
    module-level ``headers``.
    """

    # Only absolute ``http:`` URLs are followed or downloaded.  Compiled once
    # here instead of once per page.
    _HTTP_URL = re.compile(r'^http:')

    def __init__(self, containt, point=0):
        """Store ``containt`` and ``point`` on the instance.

        Neither value is read by any other method of this class.
        """
        print("HelloWorld")
        self.containt = containt
        self.point = point

    def _get_pool(self):
        """Return this spider's urllib3 pool, creating it on first use."""
        pool = getattr(self, "_pool", None)
        if pool is None:
            pool = self._pool = urllib3.PoolManager()
        return pool

    def getURL(self, inputURL):
        """Queue ``inputURL`` as a page to crawl and record it as seen."""
        urllist.append(inputURL)
        urllistAll.append(inputURL)
        print(len(urllist))

    def findURL(self):
        """Crawl every page that is queued at the moment of the call.

        For each page: persist the seen-URL list, fetch the page, save its
        images and append every not-yet-seen ``http:`` link to the queue.
        Links discovered during this pass are left for the next call.  A
        failure on one page is reported and does not stop the pass.
        """
        http = self._get_pool()
        # Pop a fixed number of entries instead of removing items from the
        # list while iterating over it, which silently skips queued pages.
        for _ in range(len(urllist)):
            page_url = urllist.pop(0)
            try:
                self.saveList()
                response = http.request("GET", page_url, headers=headers, timeout=10)
                # Pass the raw bytes so BeautifulSoup can detect the page
                # encoding; a plain .decode() fails on non-UTF-8 pages.
                soup = BeautifulSoup(response.data, "lxml")
                self.fineImgUrl(soup)
                for anchor in soup.find_all(name='a', attrs={"href": self._HTTP_URL}):
                    link = anchor.get('href')
                    if link not in urllistAll:
                        urllist.append(link)
                        urllistAll.append(link)
            except Exception as err:
                # Best-effort crawl: report and move on to the next page.
                # ``Exception`` (not a bare except) keeps Ctrl-C working.
                print("ops... an err occured!", err)

    def fineImgUrl(self, soup):
        """Save every image of ``soup`` whose ``src`` starts with ``http:``."""
        try:
            for img in soup.find_all(name="img", attrs={"src": self._HTTP_URL}):
                img_url = img.get('src')
                print(img_url)
                self.imgSave(img_url)
        except Exception as err:
            print("OPS*** An err occured!", err)

    def imgSave(self, imgUrl):
        """Download ``imgUrl`` and write it to the current directory.

        The image is skipped when its MD5 digest is already in ``md5_list``.
        The file name is the URL with ``/`` replaced by ``a`` and ``:``
        replaced by ``b``.  Failures are reported, never raised.
        """
        try:
            response = self._get_pool().request("GET", imgUrl, headers=headers, timeout=10)
            if response.status != 200:
                # Do not store an error page under an image file name.
                print("OPS.........Save image failed...", imgUrl, response.status)
                return
            image_bytes = response.data
            md5_value = hashlib.md5(image_bytes).hexdigest()
            print(md5_value)
            if md5_value in md5_list:
                print("This image is exist, pass...")
                return
            # Size of the payload in bytes.
            print(len(image_bytes))
            fileName = imgUrl.replace("/", "a").replace(":", "b")
            print(fileName)
            with open(fileName, "wb") as f:
                f.write(image_bytes)
            # Record the digest only after the write succeeded, so a failed
            # save can be retried when the image is seen again.
            md5_list.append(md5_value)
            print("saving img " + fileName)
        except Exception as err:
            print("OPS.........Save image failed...", err)

    def saveList(self):
        """Write every URL in ``urllistAll`` to ``AllList.txt``, one per line."""
        try:
            # newline="" stops text mode from turning "\r\n" into "\r\r\n"
            # on Windows; UTF-8 keeps non-ASCII URLs writable everywhere.
            with open("AllList.txt", "w", newline="", encoding="utf-8") as fb:
                fb.writelines(str(seen) + "\r\n" for seen in urllistAll)
            print("Saved all urllib!!")
        except OSError as err:
            print("Save list failed~~~", err)

    def __str__(self):
        """Return a greeting followed by the pages still waiting to be crawled."""
        parts = ["This is a network spider, nice to meet you!"]
        if urllist:
            parts.append("my URL list contain " + str(len(urllist)))
            parts.append("Item, they are ")
            parts.extend(str(pending) + " , " for pending in urllist)
        return "".join(parts)
# Script entry point.
if __name__ == "__main__":
    # Create the download directory without shelling out; exist_ok makes a
    # second run succeed instead of printing a "mkdir" error.
    os.makedirs("pic6", exist_ok=True)
    os.chdir("pic6")
    spider = Spider("hello")
    print(spider)
    spider.getURL(url)
    print(spider)
    # Crawl until no page is left in the queue.  An unconditional loop would
    # spin forever once the queue is empty.
    while urllist:
        spider.findURL()
        print(spider)
    # Persist the final list of every page that was seen.
    spider.saveList()


被折叠的 条评论
为什么被折叠?



