# 2019-11-23
import html
import os
import re  # regular-expression library
import time

import requests

# Pattern that captures the full-size image URL of each photo on the page.
#
# The pattern in the original script was an empty raw string; re.findall()
# with an empty pattern yields nothing but empty strings, so no image could
# ever be downloaded.  This one is rebuilt from the sample markup below.
# NOTE(review): confirm that the live page still exposes "data-big-src".
#
# (.*?) is a non-greedy capture group: "." matches any character, "*" repeats
# it, and "?" makes the match stop at the first closing double quote.
#
# Sample attributes of one image tag, as found in the saved page:
#   https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&dpr=2&w=500 2x"
#   class="photo-item__img" alt="Man Smiling Behind Wall" data-image-width="3476" data-image-height="5214"
#   data-big-src="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&h=750&w=1260"
#   data-large-src="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&h=650&w=940"
#   data-tiny-src="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
#   data-tiny-srcset="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500 1x,
#   https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&dpr=2&w=500 2x"
#   src="https://images.pexels.com/photos/220453/pexels-photo-220453.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" >
IMAGE_URL_PATTERN = re.compile(r'data-big-src="(.*?)"')


def extract_image_urls(page_html, pattern=IMAGE_URL_PATTERN):
    """Return the image URLs found in *page_html*.

    Args:
        page_html: HTML source of a search-result page, as a str.
        pattern: compiled regex with exactly one capture group that yields
            the URL; defaults to IMAGE_URL_PATTERN.

    Returns:
        A list of str URLs in first-seen order, with HTML entities such as
        ``&amp;`` decoded, empty matches dropped and duplicates removed.
    """
    decoded = (html.unescape(match) for match in pattern.findall(page_html))
    # dict.fromkeys() removes duplicates while keeping the original order.
    return list(dict.fromkeys(url for url in decoded if url))


if __name__ == '__main__':
    # Bulk-download image data.
    # Commercial stock-photo sites usually protect their content well and are
    # hard to scrape; https://www.pexels.com/ offers free images and is easy
    # to scrape.  Search for a keyword, press F12 in Chrome to inspect the
    # page source and locate the image links.
    search_url = 'https://www.pexels.com/search/man/'

    # The original wrote the page to './pexels/' but the images to
    # './Pexels/'; a single directory is used here and created up front.
    output_dir = './pexels/'
    os.makedirs(output_dir, exist_ok=True)

    # Fail loudly on an HTTP error instead of saving an error page.
    page_response = requests.get(url=search_url, timeout=30)
    page_response.raise_for_status()
    page_html = page_response.text

    # The saved HTML file contains the URLs of many images.
    with open(os.path.join(output_dir, 'man.html'), mode='w', encoding='utf-8') as fp:
        fp.write(page_html)
    print('网页保存成功!')

    # A list of str elements, each one a matched image URL.
    img_urls = extract_image_urls(page_html)
    print(img_urls)

    # Files are named by a running counter rather than by the photo id.
    for num_name, img_url in enumerate(img_urls, start=1):
        try:
            img_response = requests.get(img_url, timeout=30)
            img_response.raise_for_status()
        except requests.RequestException as error:
            # Skip a failed download rather than writing an error body to disk.
            print(str(num_name) + '号图片下载失败: ' + str(error))
        else:
            with open(os.path.join(output_dir, str(num_name) + '.jpg'), 'wb') as fp:
                fp.write(img_response.content)
            print(str(num_name) + '号图片下载成功!')
        time.sleep(0.1)  # throttle: wait 0.1 s between downloads

# Python file reading and writing
# open(file path, read/write mode, encoding)
# Read/write modes:
#   text:   'w'
#   images: 'wb'