A requests example: crawling the Doutula meme site (斗图网) and storing results in dicts

Anonymous technical user   2020-12-23 07:59
import os
import re
import json
import time
import random
import threading

import requests


# class myThread (threading.Thread):   # inherits from threading.Thread
#     def __init__(self):
#         threading.Thread.__init__(self)

#     def run(self):                  
#         CrawListPage(self.url, self.newdir,self.CrawledURLs)



class DoutuCrawl:
    # def __init__(self):
    #     self.locationLink ='https://www.doutula.com/article/list/'     

    # def concatLink(self):
    #     self.Htmlurl = self.locationLink +'search?type=photo&more=1&keyword='+self.keyword+'&page=1'
    #     ResponeHmtl,Maxpage = self.GetHtml(self.Htmlurl)
    #     infoList= []
    #     for i in range(Maxpage):
    #         self.Htmlurl = self.locationLink +'search?type=photo&more=1&keyword='+self.keyword+'&page='+str(i)
    #         infoList.append(self.Htmlurl)
    #     self.GetImage(infoList)
    def GetTaoTu(self):
        temp_list = []
        count = 0
        WebUrl = 'https://www.doutula.com/article/list/'
        ResponeHmtl, Maxpage = self.GetHtml(WebUrl)
        # List pages are numbered starting from 1
        for page in range(1, Maxpage + 1):
            url = 'https://www.doutula.com/article/list/?page=' + str(page)
            html, _ = self.GetHtml(url)
            if html is None:
                continue
            pattern = re.compile(r'https://www\.doutula\.com/article/detail/\d+')
            urlS = pattern.findall(html)
            for detail_url in urlS:
                count += 1
                self.searcher(detail_url, temp_list, count)
        print(temp_list)
        return temp_list
    def searcher(self, urls, temp_list, count):
        data = {}
        Tink, _ = self.GetHtml(urls)
        if Tink is None:
            return
        title = re.search(r'<h1><a href=".*?">(.*?)</a></h1>', Tink)
        update = re.search(r'<span class="glyphicon glyphicon-time">(.*?)</span>', Tink)
        if title is None or update is None:
            return
        imagpatter = re.compile(r'"this.src=(.*?)">')
        imgas = imagpatter.findall(Tink)
        data['title'] = title.group(1)
        data['update'] = update.group(1)
        data['coverimg'] = imgas[0] if imgas else ''
        data['coverlist'] = imgas
        with lock:
            temp_list.append(data)
            print('Added entry {} to the list, title: {}'.format(count, title.group(1)))
    # def GetImage(self,infoList):
    #     imglist = []
    #     titleList = [] 
    #     timelist = []
    #     tmp_list=[]
    #     for i in range(len(infoList)):
            
    #         try:
    #             imageurl = infoList[i+1]
    #             imAgeHmtl= self.GetHtml(imageurl)
    #             iMg = re.compile('data-original="(.*?)"')
    #             iMgAll=re.findall(iMg,imAgeHmtl[0])
    #             for i in range(len(iMgAll)):
    #                 imglist.append(iMgAll[i])
    #                 self.savaimg(iMgAll[i])
    #             title = re.compile('<p style="display: none">(.*?)</p>')
    #             tileAll = re.findall(title,imAgeHmtl[0])
    #             for i in range(len(tileAll)):
    #                 titleList.append(tileAll[i])
    #                 timelist.append(time.strftime("%Y-%m-%d ", time.localtime()) )

    #         except:
    #             print("error")
    #         alllist = zip(imglist,titleList,timelist)
    #         for each,i,n in alllist:
    #             data = {}
    #             data['coverimg'] = each
    #             data['title'] = i
    #             data['updata'] =time.strftime("%Y-%m-%d ", time.localtime())
    #             tmp_list.append(data)

    #         print(tmp_list)

    # def savaimg(self,imgurl):
    #     path = os.getcwd()
    #     general = path+"\\Enrichment\\"  # top-level output directory
    #     path1=general+imgurl.split("/")[-1]
    #     if not os.path.exists(general):
    #         os.mkdir(general)
    #     writeimag = requests.get(imgurl)
    #     writeimag1= writeimag.content
    #     with open(path1, 'wb') as f:
    #         f.write(writeimag1)
    #         f.close()
    def GetHtml(self, Htmlurl):
        headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
                'Host': 'www.doutula.com',
                'Upgrade-Insecure-Requests': '1',
                }
        proxies = ["115.218.222.64:9000", "120.194.18.90:81", "123.160.74.11:9999"]
        try:
            session = requests.session()
            # Note: an 'http' proxy entry is ignored for https:// URLs;
            # add an 'https' key here if the proxy should cover them too
            response = session.get(Htmlurl, timeout=30, proxies={'http': random.choice(proxies)}, headers=headers)
            response.encoding = 'utf-8'
            ResponeHmtl = response.text
            pages = re.findall(r'<li class="page-item"><a class="page-link(.*?)</a></li>', ResponeHmtl)
            # Detail pages carry no pagination links, so fall back to 0
            Maxpage = 0
            if pages:
                m = re.search(r'>(\d+)', pages[-1])
                if m:
                    Maxpage = int(m.group(1))
            return ResponeHmtl, Maxpage

        except requests.RequestException:
            return None, 0


lock = threading.Lock()
Doutu = DoutuCrawl()
# Doutu.GetTaoTu()
threads = []
for k in range(3):
    # Create a thread; pass the method itself as target (calling Doutu.GetTaoTu()
    # here would run it in the main thread and hand None to Thread).
    # Note all three threads crawl the same page range, so this duplicates work.
    new_thread = threading.Thread(target=Doutu.GetTaoTu)
    new_thread.start()
    threads.append(new_thread)
for t in threads:
    t.join()
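
The title promises the results end up in dicts, and json is imported but never used. A minimal sketch of persisting the crawled list of dicts, assuming the `return temp_list` added to GetTaoTu above; the file name 'doutu.json' is a hypothetical choice:

# Minimal sketch: write the crawled list of dicts to disk as JSON.
# Assumes GetTaoTu() returns temp_list; 'doutu.json' is a hypothetical file name.
results = DoutuCrawl().GetTaoTu()
with open('doutu.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)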