import requests,os
import re,random,threading
import json,time
class DoutuCrawl:
    """Crawler for the article listing pages of www.doutula.com.

    ``GetTaoTu`` walks every listing page, follows each article detail
    link and collects one dict per article (title, update time and image
    URLs).  ``GetHtml`` and ``searcher`` are the lower-level steps and keep
    their original signatures.
    """

    LIST_URL = 'https://www.doutula.com/article/list/'

    # The original dict literal listed 'User-Agent' twice; only the last
    # value ever took effect, so only that one is kept here.
    HEADERS = {
        'Host': 'www.doutula.com',
        "Upgrade-Insecure-Requests": '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }

    # NOTE(review): these are registered under the 'http' scheme only, so
    # they presumably do not apply to the https:// URLs used below --
    # confirm against the requests proxy documentation before relying on them.
    PROXIES = ["115.218.222.64:9000", "120.194.18.90:81", "123.160.74.11:9999"]

    # Patterns are compiled once at class creation instead of on every call.
    _DETAIL_URL_RE = re.compile(r'https://www.doutula.com/article/detail/\d+')
    _TITLE_RE = re.compile(r'<h1><a href=".*?">(.*?)</a></h1>')
    _UPDATE_RE = re.compile(r'<span class="glyphicon glyphicon-time">(.*?)</span>')
    _IMAGE_RE = re.compile(r'"this.src=(.*?)">')
    _PAGE_ITEM_RE = re.compile(r'<li class="page-item"><a class="page-link(.*?)</a></li>')
    _PAGE_NUMBER_RE = re.compile(r'>(\d+)')

    def GetTaoTu(self):
        """Crawl every listing page and print the collected article dicts.

        The page count is read from the pagination block of the first
        listing page.  If it cannot be determined (request failed or no
        pagination found) a message is printed and nothing is crawled.

        Returns:
            None.  The collected list is printed, not returned.
        """
        max_page = self._parse_max_page(self._fetch(self.LIST_URL))
        if max_page is None:
            print('could not determine the number of listing pages; nothing crawled')
            return

        articles = []
        count = 0
        # Page numbers come from the visible pagination links, so they run
        # from 1 to max_page inclusive; range(max_page) would request page 0
        # and never reach the last page.
        for page in range(1, max_page + 1):
            listing = self._fetch(self.LIST_URL + '?page=' + str(page))
            for detail_url in self._DETAIL_URL_RE.findall(listing):
                self.searcher(detail_url, articles, count)
                count += 1
        print(articles)

    def searcher(self, urls, temp_list, count):
        """Fetch one article page and append its parsed data to ``temp_list``.

        Args:
            urls: URL of a single article detail page.
            temp_list: list that receives a dict with the keys ``title``,
                ``updata``, ``coverimg`` and ``coverlist``.
            count: zero-based index of this article; reported as
                ``count + 1`` in the progress message.

        Pages that cannot be fetched or lack a title, timestamp or image
        are reported and skipped instead of aborting the crawl.
        """
        data = self._parse_article(self._fetch(urls))
        if data is None:
            print('skipping {}: title, update time or images not found'.format(urls))
            return
        temp_list.append(data)
        print('第{}页加入列表,标题是{}'.format(count + 1, data['title']))

    def GetHtml(self, Htmlurl):
        """Download ``Htmlurl`` and return its HTML, plus the page count if any.

        Returns:
            ``(html, max_page)`` when the page has a pagination block whose
            last item carries a page number; otherwise the HTML string alone.
            A failed request yields ``''``.
        """
        html = self._fetch(Htmlurl)
        max_page = self._parse_max_page(html)
        if max_page is None:
            return html
        return html, max_page

    def _fetch(self, url):
        """Return the body of ``url`` decoded as UTF-8, or ``''`` on a request error."""
        try:
            # The context manager closes the session's connections once the
            # response body has been read.
            with requests.Session() as session:
                response = session.get(
                    url,
                    timeout=30,
                    proxies={'http': random.choice(self.PROXIES)},
                    headers=self.HEADERS,
                )
        except requests.RequestException as err:
            print('request for {} failed: {}'.format(url, err))
            return ''
        response.encoding = 'utf-8'
        return response.text

    @classmethod
    def _parse_max_page(cls, html):
        """Return the number in the last pagination item of ``html``, or None."""
        items = cls._PAGE_ITEM_RE.findall(html)
        if not items:
            return None
        number = cls._PAGE_NUMBER_RE.search(items[-1])
        if number is None:
            return None
        return int(number.group(1))

    @classmethod
    def _parse_article(cls, html):
        """Return the article dict parsed from ``html``, or None if incomplete."""
        title = cls._TITLE_RE.search(html)
        updated = cls._UPDATE_RE.search(html)
        images = cls._IMAGE_RE.findall(html)
        if title is None or updated is None or not images:
            return None
        return {
            'title': title.group(1),
            'updata': updated.group(1),
            'coverimg': images[0],
            'coverlist': images,
        }
# Module-level lock object; nothing in this section acquires it.
lock = threading.Lock()
Doutu = DoutuCrawl()

# Start three worker threads and wait for all of them.
# The bound method itself is passed as the target: writing
# `Doutu.GetTaoTu()` would run the crawl in the main thread and hand its
# return value to Thread instead.  GetTaoTu takes no arguments, so no
# `args` are supplied.
# NOTE(review): every thread performs the same full crawl, so the work is
# repeated three times -- confirm whether that is intended.
workers = []
for _ in range(3):
    new_thread = threading.Thread(target=Doutu.GetTaoTu)
    new_thread.start()
    workers.append(new_thread)

# Join every started thread (the original joined an undefined name `t1`).
for worker in workers:
    worker.join()