Web Scraper: Downloading Tens of Thousands of Old Images from jandan.net

Following the previous post on downloading girl pictures from jandan.net, I found two more sources online for old images from past years. The first is a web page into which the author dumped every image address: no text at all, just over five thousand pictures. The second is a local html file that, opened in a browser, likewise shows nothing but images, more than six thousand of them. From what I have seen so far, the images from the two sources do not overlap. After downloading everything and discarding the dead links, there are well over ten thousand images.

Overall, this scraper is fairly simple to write: there are no complicated tags to deal with and no page numbers, so a crude brute-force approach is enough.

The plan: 1. Fetch the data with requests, extract all the image addresses, and write them to a file named allurl.txt. 2. Read the image addresses from allurl.txt and download them; addresses that fail go into passurl.txt, addresses that succeed go into okurl.txt. 3. Rename the passurl.txt from step 2 to allurl.txt, delete okurl.txt and the old allurl.txt, switch on a proxy or change something else, and repeat step 2 until there are no more addresses left to download.
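Step 3 of this plan never appears in the scripts below, so here is a minimal sketch of what that reset could look like, assuming the same D:\download\pic folder and file names as the scripts; the helper name is my own.

import os

# Hypothetical helper for step 3: promote passurl.txt to be the new allurl.txt and clear out
# okurl.txt and the old allurl.txt, so the download script can be run again (for example with
# a proxy switched on).
def reset_for_retry(workdir=r'D:\download\pic'):
    os.chdir(workdir)
    if os.path.exists('allurl.txt'):
        os.remove('allurl.txt')                  # drop the old allurl.txt
    if os.path.exists('okurl.txt'):
        os.remove('okurl.txt')                   # drop the list of finished downloads
    if os.path.exists('passurl.txt'):
        os.rename('passurl.txt', 'allurl.txt')   # failed addresses become the new to-do list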

1. Getting the data from the web page and writing allurl.txt
import requests
import chardet
from bs4 import BeautifulSoup
import os
import pickle

# Return the soup data of the page
def data(url):
    agent = 'User-Agent'
    agentvalue = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    headers = {agent: agentvalue}
    #proxy = {'http':'120.78.174.170:8080'} , proxies=proxy
    response = requests.get(url=url, headers=headers, timeout=5)
    global code
    code = chardet.detect(response.content)['encoding']
    response.encoding = code
    html = response.text
    #print(html)
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# Generator yielding the address of every image on the page
def getpicurl(soup):
    list1 = soup.find_all('img')
    for eachurl in list1:
        picurl = eachurl['src']
        yield picurl

# The main program writes allurl.txt, a pickled binary file holding every image download address
def main():
    allurllist = []
    try:  # create the folder; if it already exists, just pass
        os.mkdir(r'D:\download\pic')
    except FileExistsError:
        pass
    os.chdir(r'D:\download\pic')  # change the working directory

    url = r'http://js.funet8.com/html/jiandan-meizhi.html'  # address of the web page
    soupdata = data(url)  # soup data of the page
    pictureurl = getpicurl(soupdata)  # generator yielding each image address
    for eachpicurl in pictureurl:
        # Filter the addresses: quirks in the page's markup leave some of them malformed
        if '<img src=' in eachpicurl:
            continue
        if ' />' in eachpicurl:
            eachpicurl = eachpicurl.replace(' />', '')
        # Tweak the address a little to get the full-size image
        if 'sinaimg' in eachpicurl:
            if 'bmiddle' in eachpicurl:
                eachpicurl = eachpicurl.replace('bmiddle', 'large')
            if 'mw690' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw690', 'large')
            if 'mw600' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw600', 'large')
            if 'thumbnail' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumbnail', 'large')
            if 'small' in eachpicurl:
                eachpicurl = eachpicurl.replace('small', 'large')
            if 'thumb150' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumb150', 'large')
        allurllist.append(eachpicurl)
    # Write the list of all the addresses to a file
    with open('allurl.txt', 'wb') as f2:
        pickle.dump(allurllist, f2)


if __name__ == '__main__':
    main()
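
As a quick sanity check (not part of the original script), the pickled list can be loaded back to confirm how many addresses were collected:

import pickle

# Load the pickled address list back and count it
with open(r'D:\download\pic\allurl.txt', 'rb') as f:
    urls = pickle.load(f)
print(len(urls), 'image addresses collected')
print(urls[:3])  # peek at the first few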

2. Getting the data from the local html file
import requests
import chardet
from bs4 import BeautifulSoup
import os
import pickle

# Return the soup data of a page fetched over HTTP (kept from the previous script, unused here)
def data(url):
    agent = 'User-Agent'
    agentvalue = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    headers = {agent: agentvalue}
    #proxy = {'http':'120.78.174.170:8080'} , proxies=proxy

    response = requests.get(url=url, headers=headers, timeout=5)
    global code
    code = chardet.detect(response.content)['encoding']
    response.encoding = code
    html = response.text
    #print(html)
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# Generator yielding the address of every image on the page
def getpicurl(soup):
    list1 = soup.find_all('img')
    for eachurl in list1:
        picurl = eachurl['src']
        yield picurl

def main():
    allurllist = []

    try:  # create the folder; if it already exists, just pass
        os.mkdir(r'D:\download\pic')
    except FileExistsError:
        pass
    os.chdir(r'D:\download\pic')  # change the working directory

    url = r'E:\python files\IDEL FILES\My py file\爬虫煎蛋网旧图片\jiandan.html'  # path of the saved file, change it to match your setup

    # Read the local html file. Note that open() needs the encoding argument, otherwise you will hit
    # encoding errors; chardet can be used to work out which encoding to pass.
    # Iterating over f directly yields the raw string of each image tag, which would be awkward to
    # work with, so the file is parsed with BeautifulSoup instead.
    with open(url, mode='r', encoding='utf-8') as f:
        soupdata = BeautifulSoup(f, 'html.parser')
    pictureurl = getpicurl(soupdata)
    for eachpicurl in pictureurl:
        if '<img src=' in eachpicurl:
            continue
        if ' />' in eachpicurl:
            eachpicurl = eachpicurl.replace(' />', '')
        if 'sinaimg' in eachpicurl:
            if 'bmiddle' in eachpicurl:
                eachpicurl = eachpicurl.replace('bmiddle', 'large')
            if 'mw690' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw690', 'large')
            if 'mw600' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw600', 'large')
            if 'thumbnail' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumbnail', 'large')
            if 'small' in eachpicurl:
                eachpicurl = eachpicurl.replace('small', 'large')
            if 'thumb150' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumb150', 'large')
        allurllist.append(eachpicurl)
    with open('allurl.txt', 'wb') as f2:
        pickle.dump(allurllist, f2)

if __name__ == '__main__':
    main()
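
The comment in the script above mentions using chardet to work out which encoding to pass to open(); a minimal sketch of that check, using the same local file path, might look like this:

import chardet

# Read the raw bytes of the local html file and let chardet guess its encoding;
# the guess is what gets passed as the encoding argument when opening the file for parsing.
localfile = r'E:\python files\IDEL FILES\My py file\爬虫煎蛋网旧图片\jiandan.html'
with open(localfile, 'rb') as f:
    raw = f.read()
print(chardet.detect(raw))  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}
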
3. Reading the addresses from allurl.txt and downloading the images
import requests
import chardet
from bs4 import BeautifulSoup
import os.path
import os
import pickle


# Return the binary content fetched from an address
def bhtml(url):
    agent = 'User-Agent'
    agentvalue = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    headers = {agent: agentvalue}
    #proxy = {'http':'120.78.174.170:8080'} , proxies=proxy
    response = requests.get(url=url, headers=headers, timeout=5)
    bhtml = response.content
    return bhtml

# Generator yielding the address of every image on the page (kept from the earlier scripts, unused here)
def getpicurl(soup):
    list1 = soup.find_all('img')
    for eachurl in list1:
        picurl = eachurl['src']
        yield picurl

# Read the addresses from allurl.txt and download the images
def main():
    try:  # create the folder; if it already exists, just pass
        os.mkdir(r'D:\download\pic')
    except FileExistsError:
        pass
    os.chdir(r'D:\download\pic')  # change the working directory

    # On the first run the addresses are read straight from allurl.txt and passurl/okurl are both empty.
    # If the program was stopped partway for any reason, remove the already-downloaded and the failed
    # addresses from allurl, so that restarting does not download the same images again.
    try:
        with open(r'D:\download\pic\passurl.txt', 'rb') as psurl:
            passurllist = pickle.load(psurl)
        with open(r'D:\download\pic\okurl.txt', 'rb') as goodurl:
            okurllist = pickle.load(goodurl)
        with open(r'D:\download\pic\allurl.txt', 'rb') as totalurl:
            allurllist = pickle.load(totalurl)
    except FileNotFoundError:
        passurllist = []
        okurllist = []
        with open(r'D:\download\pic\allurl.txt', 'rb') as totalurl:
            allurllist = pickle.load(totalurl)
    else:
        newalllist = [x for x in allurllist if x not in passurllist and x not in okurllist]
        with open(r'D:\download\pic\allurl.txt', 'wb') as total:
            pickle.dump(newalllist, total)
        allurllist = newalllist  # only the remaining addresses still need downloading

    # Addresses that fail while fetching are saved into passurl; finished downloads go into okurl.
    for eachpicurl in allurllist:
        picname = os.path.split(eachpicurl)[1]
        try:
            picdata = bhtml(eachpicurl)
        except Exception:
            print(eachpicurl, 'download failed')
            passurllist.append(eachpicurl)
            with open('passurl.txt', 'wb') as f1:
                pickle.dump(passurllist, f1)
            continue
        else:
            try:
                with open(picname, 'wb') as f:
                    f.write(picdata)
            except Exception:
                print(eachpicurl, 'download failed')
                passurllist.append(eachpicurl)
                with open('passurl.txt', 'wb') as f4:
                    pickle.dump(passurllist, f4)
                continue
            else:
                print(eachpicurl, 'download ok')
                okurllist.append(eachpicurl)
                with open('okurl.txt', 'wb') as f3:
                    pickle.dump(okurllist, f3)

if __name__ == '__main__':
    main()
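
When step 3 of the plan calls for switching on a proxy before rerunning the downloader, the commented-out proxy line in bhtml() can be enabled along these lines; the proxy address below is only the placeholder from that comment, so substitute one that actually works:

import requests

# Variant of bhtml() with the proxy enabled; swap in a working HTTP proxy before rerunning.
def bhtml_with_proxy(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    proxy = {'http': 'http://120.78.174.170:8080'}  # placeholder address from the original comment
    response = requests.get(url=url, headers=headers, proxies=proxy, timeout=5)
    return response.content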

Summary:

  1. How to read data from a local html file
  2. How to handle exceptions