Crawler: Downloading Images from 精品聚合网 (jingpinjuhe.com)

I previously wrote two posts on downloading images from 煎蛋网 (jandan.net). This one downloads the girl photos from 精品聚合网; the approach is almost identical to the jandan.net downloader, so I won't go through it in detail again. After the download finished there were again more than ten thousand photos.

import requests
from bs4 import BeautifulSoup
import chardet
import os.path
import os
import time
import pickle

# Fetch a URL and return the raw binary content (used for image data).
def bhtml(url):
    agent = 'User-Agent'
    agentvalue = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    headers = {agent: agentvalue}
    #proxy = {'http':'120.78.174.170:8080'} , proxies=proxy
    response = requests.get(url=url, headers=headers)
    return response.content

# Fetch a URL and return its BeautifulSoup object, detecting the encoding first.
def data(url):
    agent = 'User-Agent'
    agentvalue = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    headers = {agent: agentvalue}
    #proxy = {'http':'120.78.174.170:8080'} , proxies=proxy

    response = requests.get(url=url, headers=headers)
    global code
    code = chardet.detect(response.content)['encoding']
    response.encoding = code
    html = response.text
    #print(html)
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# Generator yielding the URL of every listing page, counting down to page 1.
def getpage():
    page = 12
    for n in range(page, 0, -1):  # count pages down to 1
        pageurl = ''.join(['http://www.jingpinjuhe.com/?cat=26&paged=', str(n)])
        yield pageurl


# Generator yielding the URL of every post linked on a listing page.
def getitempage(soup):
    list1 = soup.find_all('a', attrs={'target': '_blank', 'title': True})  # <a target="_blank" href="url address" title="title message">some message</a>
    for eachitem in list1:
        itemurl = eachitem['href']
        itemtitle = eachitem['title']
        print(itemtitle, itemurl, 'downloading....')
        yield itemurl


# Collect the image URLs on a post page and save each image to the current directory.
def downpic(soup):
    passurllist = []
    list2 = soup.find_all('img', attrs={'src': True})
    for pictag in list2:
        eachpicurl = pictag['src']
        if '<img src=' in eachpicurl:
            continue
        if ' />' in eachpicurl:
            eachpicurl = eachpicurl.replace(' />', '')
        if 'sinaimg' in eachpicurl:
            # rewrite Sina image URLs so the full-size version is downloaded
            if 'bmiddle' in eachpicurl:
                eachpicurl = eachpicurl.replace('bmiddle', 'large')
            if 'mw690' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw690', 'large')
            if 'mw600' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw600', 'large')
            if 'thumbnail' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumbnail', 'large')
            if 'small' in eachpicurl:
                eachpicurl = eachpicurl.replace('small', 'large')
            if 'thumb150' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumb150', 'large')
        if eachpicurl in ['http://www.jingpinjuhe.com/wp-content/themes/xiu/images/logo.png']:  # skip the site logo
            continue
        picname = os.path.split(eachpicurl)[1]
        try:
            # fetch the binary data of the image
            picdata = bhtml(eachpicurl)
        except:
            print(eachpicurl, 'failed to download')
            passurllist.append(eachpicurl)
            with open('passurl.txt', 'wb') as f1:  # keep a pickled list of failed URLs
                pickle.dump(passurllist, f1)
            continue
        else:
            if 'gif' not in picname:  # skip gif images
                with open(picname, 'wb') as f:
                    f.write(picdata)
        time.sleep(0.5)  # pause briefly between downloads

def main():
    try:  # create the download folder; pass if it already exists
        os.mkdir(r'D:\download\pic\new')
    except:
        pass
    os.chdir(r'D:\download\pic\new')  # change the working directory
    url = r'http://www.jingpinjuhe.com/?cat=26&paged=12'  # the last listing page
    pageurl = getpage()  # URLs of all listing pages
    for x in pageurl:
        print(x)
        pagesoup = data(x)  # soup of each listing page
        itemaddress = getitempage(pagesoup)
        for itemurl in itemaddress:
            if itemurl in [r'http://www.jingpinjuhe.com/?p=2965']:
                continue
            itemsoup = data(itemurl)
            downpic(itemsoup)
    print('Download finished.')



if __name__ == '__main__':
    main()

The only thing I noticed after downloading was that some of the saved files have no image extension, because the source URL had none. That is easy to fix: put all the extension-less files into one folder, use os.walk(path) to list the file names, and then os.rename(oldname, oldname + '.jpg') solves the problem.
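A minimal sketch of that cleanup step (the folder path is hypothetical, and it assumes every extension-less file really is a JPEG):

import os

# Hypothetical folder holding the downloaded files that lack an extension.
path = r'D:\download\pic\new'

for dirpath, dirnames, filenames in os.walk(path):
    for name in filenames:
        # Skip files that already have an extension.
        if os.path.splitext(name)[1]:
            continue
        oldname = os.path.join(dirpath, name)
        os.rename(oldname, oldname + '.jpg')  # assume the file is a JPEG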