大家在学习一段时间后,可以尝试自建数据集。本源码可从百度图片网站根据关键字和数量批量采集图片(也可在源码内更换采集的目标网址)。相关功能已在源码中注释说明,可帮助大家提高采集图片数据的效率。
import re
import uuid
import requests
import os
class DownloadImages:
    """Batch-download images from Baidu image search for one keyword.

    Images are saved under ``../images/<all_class>/<key_word>/`` with
    UUID-based filenames to avoid collisions.
    """

    def __init__(self, download_max, all_class, key_word):
        # Running count of images downloaded so far for this keyword.
        self.download_sum = 0
        # Per-keyword download cap (accepts str or int).
        self.download_max = int(download_max)
        self.key_word = key_word
        # Target directory: ../images/<class>/<keyword>
        self.save_path = '../images/%s/%s' % (all_class, key_word)

    def start_download(self):
        """Page through Baidu search results until download_max is reached."""
        self.download_sum = 0
        gsm = 80
        str_gsm = str(gsm)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        while self.download_sum < self.download_max:
            # Baidu pages by result offset; reuse the running count as `pn`.
            str_pn = str(self.download_sum)
            # Target URL to scrape (swap this out to change the source site).
            url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&' \
                  'word=' + self.key_word + '&pn=' + str_pn + '&gsm=' + str_gsm + \
                  '&ct=&ic=0&lm=-1&width=0&height=0'
            print(url)
            before = self.download_sum
            result = requests.get(url)
            self.downloadImages(result.text)
            # BUG FIX: if a page yields no new images, stop paging instead of
            # refetching the same offset forever (original could loop endlessly).
            if self.download_sum == before:
                break
        print("下载完成")

    def downloadImages(self, html):
        """Extract "objURL" image links from a result page and fetch each one."""
        img_urls = re.findall('"objURL":"(.*?)",', html, re.S)
        print('找到关键词:' + self.key_word + '的图片,现在开始下载图片...')
        for img_url in img_urls:
            print('正在下载第' + str(self.download_sum + 1) + '张图片,图片地址:' + str(img_url))
            try:
                pic = requests.get(img_url, timeout=50)
                # uuid1 filename avoids collisions; keep the source extension.
                pic_name = self.save_path + '/' + str(uuid.uuid1()) + '.' + str(img_url).split('.')[-1]
                with open(pic_name, 'wb') as f:
                    f.write(pic.content)
                self.download_sum += 1
                if self.download_sum >= self.download_max:
                    break
            except Exception:
                # BUG FIX: was a bare `except:` printing a literal dangling '%s';
                # now catches only Exception and reports the failing URL.
                print('【错误】当前图片无法下载,%s' % img_url)
                continue
if __name__ == '__main__':
    # Interactive entry point: prompt for a category name, a set of
    # keywords, and a per-keyword download count, then download each set.
    all_class = input('请输入你要下载总类别名称:')
    key_word_max = input('请输入你要下载几个类别:')
    key_words = []
    # BUG FIX: loop variable was named `sum`, shadowing the builtin.
    for idx in range(int(key_word_max)):  # number of keywords to collect
        key_words.append(input('请输入第%s个关键字:' % str(idx + 1)))
    max_sum = input('请输入每个类别下载的数量:')
    for key_word in key_words:
        downloader = DownloadImages(max_sum, all_class, key_word)
        downloader.start_download()
收藏
点赞
0
个赞
请登录后评论
TOP
切换版块