Web Scraper: Downloading Tens of Thousands of Old Images from jandan.net

Following the previous post on downloading girl pictures from jandan.net, I found two more sources online for old images from past years. The first is a web page into which the author dumped every image address: no text at all, just over five thousand pictures. The second is a local html file that, opened in a browser, likewise shows nothing but images, more than six thousand of them. From what I have seen so far, the images from the two sources do not overlap. After downloading everything and discarding the dead links, there are well over ten thousand images.

Overall, this scraper is fairly simple to write: there are no complicated tags to deal with and no page numbers, so a crude brute-force approach is enough.

The plan: 1. Fetch the data with requests, extract all the image addresses, and write them to a file named allurl.txt. 2. Read the image addresses from allurl.txt and download them; addresses that fail go into passurl.txt, addresses that succeed go into okurl.txt. 3. Rename the passurl.txt from step 2 to allurl.txt, delete okurl.txt and the old allurl.txt, switch on a proxy or change something else, and repeat step 2 until there are no more addresses left to download.
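Step 3 of this plan never appears in the scripts below, so here is a minimal sketch of what that reset could look like, assuming the same D:\download\pic folder and file names as the scripts; the helper name is my own.

import os

# Hypothetical helper for step 3: promote passurl.txt to be the new allurl.txt and clear out
# okurl.txt and the old allurl.txt, so the download script can be run again (for example with
# a proxy switched on).
def reset_for_retry(workdir=r'D:\download\pic'):
    os.chdir(workdir)
    if os.path.exists('allurl.txt'):
        os.remove('allurl.txt')                  # drop the old allurl.txt
    if os.path.exists('okurl.txt'):
        os.remove('okurl.txt')                   # drop the list of finished downloads
    if os.path.exists('passurl.txt'):
        os.rename('passurl.txt', 'allurl.txt')   # failed addresses become the new to-do list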

1. Getting the data from the web page and writing allurl.txt
import requests
import chardet
from bs4 import BeautifulSoup
import os
import pickle

# Return the soup data of the page
def data(url):
    agent = 'User-Agent'
    agentvalue = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    headers = {agent: agentvalue}
    #proxy = {'http':'120.78.174.170:8080'} , proxies=proxy
    response = requests.get(url=url, headers=headers, timeout=5)
    global code
    code = chardet.detect(response.content)['encoding']
    response.encoding = code
    html = response.text
    #print(html)
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# Generator yielding the address of every image on the page
def getpicurl(soup):
    list1 = soup.find_all('img')
    for eachurl in list1:
        picurl = eachurl['src']
        yield picurl

# The main program writes allurl.txt, a pickled binary file holding every image download address
def main():
    allurllist = []
    try:  # create the folder; if it already exists, just pass
        os.mkdir(r'D:\download\pic')
    except FileExistsError:
        pass
    os.chdir(r'D:\download\pic')  # change the working directory

    url = r'http://js.funet8.com/html/jiandan-meizhi.html'  # address of the web page
    soupdata = data(url)  # soup data of the page
    pictureurl = getpicurl(soupdata)  # generator yielding each image address
    for eachpicurl in pictureurl:
        # Filter the addresses: quirks in the page's markup leave some of them malformed
        if '<img src=' in eachpicurl:
            continue
        if ' />' in eachpicurl:
            eachpicurl = eachpicurl.replace(' />', '')
        # Tweak the address a little to get the full-size image
        if 'sinaimg' in eachpicurl:
            if 'bmiddle' in eachpicurl:
                eachpicurl = eachpicurl.replace('bmiddle', 'large')
            if 'mw690' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw690', 'large')
            if 'mw600' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw600', 'large')
            if 'thumbnail' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumbnail', 'large')
            if 'small' in eachpicurl:
                eachpicurl = eachpicurl.replace('small', 'large')
            if 'thumb150' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumb150', 'large')
        allurllist.append(eachpicurl)
    # Write the list of all the addresses to a file
    with open('allurl.txt', 'wb') as f2:
        pickle.dump(allurllist, f2)


if __name__ == '__main__':
    main()
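
As a quick sanity check (not part of the original script), the pickled list can be loaded back to confirm how many addresses were collected:

import pickle

# Load the pickled address list back and count it
with open(r'D:\download\pic\allurl.txt', 'rb') as f:
    urls = pickle.load(f)
print(len(urls), 'image addresses collected')
print(urls[:3])  # peek at the first few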

2. Getting the data from the local html file
import requests
import chardet
from bs4 import BeautifulSoup
import os
import pickle

# Return the soup data of a page fetched over HTTP (kept from the previous script, unused here)
def data(url):
    agent = 'User-Agent'
    agentvalue = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    headers = {agent: agentvalue}
    #proxy = {'http':'120.78.174.170:8080'} , proxies=proxy

    response = requests.get(url=url, headers=headers, timeout=5)
    global code
    code = chardet.detect(response.content)['encoding']
    response.encoding = code
    html = response.text
    #print(html)
    soup = BeautifulSoup(html, 'html.parser')
    return soup

# Generator yielding the address of every image on the page
def getpicurl(soup):
    list1 = soup.find_all('img')
    for eachurl in list1:
        picurl = eachurl['src']
        yield picurl

def main():
    allurllist = []

    try:  # create the folder; if it already exists, just pass
        os.mkdir(r'D:\download\pic')
    except FileExistsError:
        pass
    os.chdir(r'D:\download\pic')  # change the working directory

    url = r'E:\python files\IDEL FILES\My py file\爬虫煎蛋网旧图片\jiandan.html'  # path of the saved file, change it to match your setup

    # Read the local html file. Note that open() needs the encoding argument, otherwise you will hit
    # encoding errors; chardet can be used to work out which encoding to pass.
    # Iterating over f directly yields the raw string of each image tag, which would be awkward to
    # work with, so the file is parsed with BeautifulSoup instead.
    with open(url, mode='r', encoding='utf-8') as f:
        soupdata = BeautifulSoup(f, 'html.parser')
    pictureurl = getpicurl(soupdata)
    for eachpicurl in pictureurl:
        if '<img src=' in eachpicurl:
            continue
        if ' />' in eachpicurl:
            eachpicurl = eachpicurl.replace(' />', '')
        if 'sinaimg' in eachpicurl:
            if 'bmiddle' in eachpicurl:
                eachpicurl = eachpicurl.replace('bmiddle', 'large')
            if 'mw690' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw690', 'large')
            if 'mw600' in eachpicurl:
                eachpicurl = eachpicurl.replace('mw600', 'large')
            if 'thumbnail' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumbnail', 'large')
            if 'small' in eachpicurl:
                eachpicurl = eachpicurl.replace('small', 'large')
            if 'thumb150' in eachpicurl:
                eachpicurl = eachpicurl.replace('thumb150', 'large')
        allurllist.append(eachpicurl)
    with open('allurl.txt', 'wb') as f2:
        pickle.dump(allurllist, f2)

if __name__ == '__main__':
    main()
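
The comment in the script above mentions using chardet to work out which encoding to pass to open(); a minimal sketch of that check, using the same local file path, might look like this:

import chardet

# Read the raw bytes of the local html file and let chardet guess its encoding;
# the guess is what gets passed as the encoding argument when opening the file for parsing.
localfile = r'E:\python files\IDEL FILES\My py file\爬虫煎蛋网旧图片\jiandan.html'
with open(localfile, 'rb') as f:
    raw = f.read()
print(chardet.detect(raw))  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}
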
3. Reading the addresses from allurl.txt and downloading the images
import requests
import chardet
from bs4 import BeautifulSoup
import os.path
import os
import pickle


# Return the binary content fetched from an address
def bhtml(url):
    agent = 'User-Agent'
    agentvalue = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    headers = {agent: agentvalue}
    #proxy = {'http':'120.78.174.170:8080'} , proxies=proxy
    response = requests.get(url=url, headers=headers, timeout=5)
    bhtml = response.content
    return bhtml

# Generator yielding the address of every image on the page (kept from the earlier scripts, unused here)
def getpicurl(soup):
    list1 = soup.find_all('img')
    for eachurl in list1:
        picurl = eachurl['src']
        yield picurl

# Read the addresses from allurl.txt and download the images
def main():
    try:  # create the folder; if it already exists, just pass
        os.mkdir(r'D:\download\pic')
    except FileExistsError:
        pass
    os.chdir(r'D:\download\pic')  # change the working directory

    # On the first run the addresses are read straight from allurl.txt and passurl/okurl are both empty.
    # If the program was stopped partway for any reason, remove the already-downloaded and the failed
    # addresses from allurl, so that restarting does not download the same images again.
    try:
        with open(r'D:\download\pic\passurl.txt', 'rb') as psurl:
            passurllist = pickle.load(psurl)
        with open(r'D:\download\pic\okurl.txt', 'rb') as goodurl:
            okurllist = pickle.load(goodurl)
        with open(r'D:\download\pic\allurl.txt', 'rb') as totalurl:
            allurllist = pickle.load(totalurl)
    except FileNotFoundError:
        passurllist = []
        okurllist = []
        with open(r'D:\download\pic\allurl.txt', 'rb') as totalurl:
            allurllist = pickle.load(totalurl)
    else:
        newalllist = [x for x in allurllist if x not in passurllist and x not in okurllist]
        with open(r'D:\download\pic\allurl.txt', 'wb') as total:
            pickle.dump(newalllist, total)
        allurllist = newalllist  # only the remaining addresses still need downloading

    # Addresses that fail while fetching are saved into passurl; finished downloads go into okurl.
    for eachpicurl in allurllist:
        picname = os.path.split(eachpicurl)[1]
        try:
            picdata = bhtml(eachpicurl)
        except Exception:
            print(eachpicurl, 'download failed')
            passurllist.append(eachpicurl)
            with open('passurl.txt', 'wb') as f1:
                pickle.dump(passurllist, f1)
            continue
        else:
            try:
                with open(picname, 'wb') as f:
                    f.write(picdata)
            except Exception:
                print(eachpicurl, 'download failed')
                passurllist.append(eachpicurl)
                with open('passurl.txt', 'wb') as f4:
                    pickle.dump(passurllist, f4)
                continue
            else:
                print(eachpicurl, 'download ok')
                okurllist.append(eachpicurl)
                with open('okurl.txt', 'wb') as f3:
                    pickle.dump(okurllist, f3)

if __name__ == '__main__':
    main()
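
When step 3 of the plan calls for switching on a proxy before rerunning the downloader, the commented-out proxy line in bhtml() can be enabled along these lines; the proxy address below is only the placeholder from that comment, so substitute one that actually works:

import requests

# Variant of bhtml() with the proxy enabled; swap in a working HTTP proxy before rerunning.
def bhtml_with_proxy(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    proxy = {'http': 'http://120.78.174.170:8080'}  # placeholder address from the original comment
    response = requests.get(url=url, headers=headers, proxies=proxy, timeout=5)
    return response.content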

Summary:

  1. How to read data from a local html file
  2. How to handle exceptions