python3 爬取百度图片--指定分辨率

插件安装

1
2
3
pip install beautifulsoup4
pip install requests
pip install lxml

代码

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re
import requests
from urllib import error
from bs4 import BeautifulSoup
import os
import json
 
# Running count of images downloaded so far (updated by dowmloadPicture).
num = 0
# Total number of images the user asked for (set in __main__).
numPicture = 0
# Destination folder name (set in __main__).
file = ''
# Accumulated per-page lists of image URLs (filled by Find).
List = []

# NOTE(review): 'global' at module level is a no-op statement in Python;
# it is kept here unchanged but has no effect.
global num_per_page 
# Number of images Baidu returns per result page (used as the paging step).
num_per_page=20
 
 
def Find(url):
    """Count how many images Baidu reports for the query.

    Pages through the search results (``url`` + page offset) until an
    empty page is returned or the offset reaches 2000.  Each page's list
    of image URLs is appended to the module-level ``List``.

    Returns the total number of image URLs found.
    """
    global List
    print('正在检测图片总数.....')
    t = 0
    s = 0
    while t < 2000:
        Url = url + str(t)
        try:
            Result = requests.get(Url, timeout=7)
        except requests.exceptions.RequestException:
            # Narrowed from BaseException, which would also swallow
            # KeyboardInterrupt/SystemExit; skip this page and continue.
            t = t + num_per_page
            continue
        else:
            result = Result.text
            # Image URLs are embedded in the page as "objURL":"..."
            pic_url = re.findall('"objURL":"(.*?)",', result, re.S)
            s += len(pic_url)
            if len(pic_url) == 0:
                # An empty page means we have walked past the last result.
                break
            else:
                List.append(pic_url)
                t = t + num_per_page
    return s
 
 
def recommend(url):
    """Scrape Baidu's related-search box (div id="topRS").

    Returns a list of suggested keyword strings, or None when the page
    cannot be fetched.
    """
    Re = []
    try:
        html = requests.get(url)
    except requests.exceptions.RequestException:
        # Bug fix: requests raises its own exception hierarchy, not
        # urllib.error.HTTPError, so the original handler could never fire.
        return
    else:
        html.encoding = 'utf-8'
        bsObj = BeautifulSoup(html.text, 'html.parser')
        div = bsObj.find('div', id='topRS')
        if div is not None:
            listA = div.findAll('a')
            for i in listA:
                if i is not None:
                    Re.append(i.get_text())
        return Re
 
 
def dowmloadPicture(html, keyword ,file):
    """Download every image referenced by an objURL on one result page.

    Images are saved into directory *file* as ``<keyword>_<n>.jpg``,
    incrementing the module-level counter ``num``; downloading stops as
    soon as ``num`` reaches ``numPicture``.

    NOTE(review): the function name keeps its historical typo ("dowmload")
    because the __main__ block calls it by this name.
    """
    global num
    # Image URLs are embedded in the page as "objURL":"..."
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    print('找到关键词:' + keyword + '的图片,即将开始下载图片...')
    for each in pic_url:
        print('正在下载第' + str(num + 1) + '张图片,图片地址:' + str(each))
        try:
            if each is not None:
                pic = requests.get(each, timeout=7)
            else:
                continue
        except requests.exceptions.RequestException:
            # Narrowed from BaseException, which would also swallow
            # KeyboardInterrupt; a failed download just skips the image.
            print('错误,当前图片无法下载')
            continue
        else:
            string = file + '/' + keyword + '_' + str(num) + '.jpg'
            # 'with' guarantees the handle is closed even if the write fails
            # (the original leaked the handle on error).
            with open(string, 'wb') as fp:
                fp.write(pic.content)
            num += 1
        if num >= numPicture:
            return
 
 
if __name__ == '__main__':  # script entry point
    t = 0
    word = input("请输入搜索关键词 (最好中文关键字): ")
    height = input("输入照片的高度(若需要指定,直接回车跳过): ")
    width = input("输入照片宽度 (若需要指定,直接回车跳过): ")
    numPicture = int(input('请输入想要下载的图片数量 :'))
    file = input('请建立一个存储图片的文件夹,输入文件夹名称即可 : ')
    # Bug fix: keep prompting until the name is unused.  The original
    # asked only once more and then called os.mkdir unconditionally,
    # crashing if the second name also existed.  (Also removed a stray
    # ')' that had leaked into the retry prompt string.)
    while os.path.exists(file):
        print('该文件已存在,请重新输入')
        file = input('请建立一个存储图片的文件夹,输入文件夹名称即可 : ')
    os.mkdir(file)
    # Fetch result pages num_per_page at a time until enough images
    # have been downloaded.
    while t < numPicture:
        try:
            url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word='
                   + word + '&pn=' + str(t) + '&gsm=50&ct=&ic=0&lm=-1&width='
                   + width + '&height=' + height)
            result = requests.get(url, timeout=20)
            print(url)
        except requests.exceptions.RequestException:
            # Bug fix: requests raises RequestException (incl. Timeout),
            # not urllib.error.HTTPError, so errors previously escaped
            # this handler and killed the script.
            print('网络错误,请调整网络后重试')
            t = t + num_per_page
        else:
            dowmloadPicture(result.text, word ,file)
            t = t + num_per_page

    print('当前搜索结束。')
updated 2020-03-13