Python web programming: bypassing anti-crawling mechanisms
Websites nowadays all ship with some anti-crawling mechanism. The first few runs of a script may succeed, but after that it stops receiving data, and a look at the site shows that a CAPTCHA-entry step has appeared.
# Setting the User-Agent with the mechanize library:
br.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36')]

# A typical set of custom headers (example below):
headers = {
    "Host": "www.cmd5.com",
    "Content-Length": "1832",
    "Cache-Control": "max-age=0",
    "Origin": "http://www.cmd5.com",
    "Upgrade-Insecure-Requests": "1",
    "Content-Type": "application/x-www-form-urlencoded",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Referer": "http://www.cmd5.com/",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cookie": "FirstVisit=2017/10/14 16:49:39; ASP.NET_SessionId=4240erfxxgel3450n4dgddej; comefrom=https://www.baidu.com/link?url=_iyok742ki838ontfqnni8s-yikrus241ocxk3cplqo&wd=&eqid=ed2c528f0003fd1a000000055b18de2e; Hm_lvt_0b7ba6c81309fff7ce4498ec7b107c0b=1528302253,1528328811,1528356400; Hm_lpvt_0b7ba6c81309fff7ce4498ec7b107c0b=1528356400",
    "Connection": "close"
}
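A note when replaying such a capture with requests: Content-Length is computed by the library, and the Cookie header is better managed by a Session, so both captured values can be dropped. A minimal sketch (my own, not from the original post) against the same cmd5.com host:

import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36',
    'Referer': 'http://www.cmd5.com/',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
})
resp = session.get('http://www.cmd5.com/')   # the Session stores any Set-Cookie values
print(resp.status_code)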
# Passing proxies to the requests library. Note that requests matches the
# proxies dict against the lowercased URL scheme, so the keys must be
# 'http'/'https' (the uppercase keys in the original snippet never match):
import requests

proxy = {'https': '117.85.105.170:808', 'http': '117.85.105.170:808'}
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Connection': 'keep-alive'}
p = requests.get('http://icanhazip.com', headers=head, proxies=proxy)
print(p.text)   # icanhazip.com echoes the IP it sees, which verifies the proxy
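Free proxies die quickly, so it pays to test one before trusting it. A small liveness check of my own (proxy_alive is an illustrative name, not from the original):

import requests

def proxy_alive(ip_port, timeout=5):
    """Return True if icanhazip.com is reachable through the proxy."""
    proxies = {'http': ip_port, 'https': ip_port}
    try:
        return requests.get('http://icanhazip.com', proxies=proxies, timeout=timeout).ok
    except requests.RequestException:
        return False

print(proxy_alive('117.85.105.170:808'))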
# A quick attempt at scraping a proxy-IP list (Python 2 with BeautifulSoup 3):
# encoding=utf8
import urllib2
import BeautifulSoup

User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
header = {}
header['User-Agent'] = User_Agent

url = 'http://www.xicidaili.com/nn/1'
req = urllib2.Request(url, headers=header)
res = urllib2.urlopen(req).read()

soup = BeautifulSoup.BeautifulSoup(res)
ips = soup.findAll('tr')
f = []
for x in range(1, len(ips)):   # skip the table's header row
    ip = ips[x]
    tds = ip.findAll("td")
    ip_temp = tds[1].contents[0] + "\t" + tds[2].contents[0] + "\n"   # IP and port columns
    f.append(ip_temp)
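The snippet above is Python 2 only (urllib2 and BeautifulSoup 3 do not exist on Python 3). A hypothetical Python 3 port using requests and bs4, assuming the xicidaili table layout is unchanged (IP in the second column, port in the third):

import requests
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'}
res = requests.get('http://www.xicidaili.com/nn/1', headers=header).text
soup = BeautifulSoup(res, 'html.parser')

ip_list = []
for row in soup.find_all('tr')[1:]:   # skip the header row
    tds = row.find_all('td')
    if len(tds) > 2:                  # ignore rows without enough cells
        ip_list.append(tds[1].get_text() + ':' + tds[2].get_text())
print(ip_list)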
# A joke crawler found online, with my own modifications. It is a fairly typical
# example: it keeps rotating proxy IPs and request headers to fool the server.
# encoding=utf8
# Site-side anti-crawling: an IP that visits too frequently gets blacklisted.
# Site strategy 1: throttle per-IP request frequency and cut off anything faster;
#   counter it by slowing the crawler (time.sleep before each request) or by switching IPs.
# Site strategy 2: count requests per User-Agent in the backend and block any that
#   exceed a threshold; the collateral damage is heavy, so most sites avoid it.
# Site strategy 3: cookie-based blocking; most sites avoid this too.
from __future__ import print_function
import random
import time

import requests
from bs4 import BeautifulSoup


# First find a site that publishes proxy IPs and scrape them; when the local IP
# is blocked, switch to a proxy IP.
class download(object):
    def __init__(self):
        self.ip_list = []   # list for storing the scraped proxy IPs
        User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
        header = {}
        header['User-Agent'] = User_Agent
        html = requests.get("http://www.xicidaili.com/nn/1", headers=header)
        response = html.text
        soup = BeautifulSoup(response, 'lxml')
        ips = soup.findAll('tr')
        for x in range(1, len(ips)):
            ip = ips[x]
            tds = ip.findAll("td")
            ip_temp = tds[1].contents[0] + ":" + tds[2].contents[0]   # "IP:port"
            self.ip_list.append(ip_temp)
        print(self.ip_list)
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

    def get(self, url, timeout, proxy=None, num_retries=6):
        ua = random.choice(self.user_agent_list)   # pick a random User-Agent string
        header = {"User-Agent": ua}                # build a complete header
        if proxy is None:   # no proxy yet: try fetching the response directly
            try:
                response = requests.get(url, headers=header, timeout=timeout)
                return response
            except:
                if num_retries > 0:
                    time.sleep(10)
                    print(u"fetch failed; retrying in 10 s,", num_retries, u"attempts left")
                    # call ourselves with the retry count decremented
                    # (the original passed num_retries-1 positionally, where it
                    # landed in the proxy slot; it must be passed by keyword)
                    return self.get(url, timeout, num_retries=num_retries - 1)
                else:
                    print(u"switching to a proxy")
                    time.sleep(10)
                    IP = random.choice(self.ip_list).strip()   # random IP, whitespace stripped
                    proxy = {"http": IP}
                    return self.get(url, timeout, proxy)
        else:
            try:
                IP = random.choice(self.ip_list).strip()   # random IP, whitespace stripped
                proxy = {"http": IP}                       # build a proxies dict
                # fetch the response through the proxy
                response = requests.get(url, headers=header, proxies=proxy, timeout=timeout)
                return response
            except:
                if num_retries > 0:
                    time.sleep(10)
                    print(u"switching proxy; retrying in 10 s,", num_retries, u"attempts left")
                    print(u"current proxy:", proxy)
                    # the recursive call re-randomizes the proxy on entry
                    return self.get(url, timeout, proxy, num_retries - 1)
                else:
                    print(u"proxy failed; dropping the proxy")
                    return self.get(url, 3)


request = download()


def qsbk(url):
    # (an earlier version of this function used a fixed header dict and a plain
    # requests.get; the rotating request.get above supersedes it)
    html = request.get(url, 3)
    # collect every <div class="content"> block (a bs4 ResultSet)
    dz = BeautifulSoup(html.text, "html.parser").find_all("div", {"class": "content"})
    for joke in dz:   # each joke is one chunk of HTML
        duanzi = joke.get_text()
        print(duanzi)


if __name__ == "__main__":
    url = "http://www.qiushibaike.com/"
    qsbk(url)
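The recursive retries above work, but every proxy switch restarts with a fresh retry budget, so failures can loop for a long time. A flatter, loop-based version of the same rotate-and-retry idea (my own sketch, Python 3; fetch, ip_list, and ua_list are illustrative names):

import random
import time
import requests

def fetch(url, ip_list, ua_list, retries=6, timeout=3):
    """Try a direct fetch first; after any failure, retry through a random proxy."""
    proxies = None
    for attempt in range(retries):
        headers = {'User-Agent': random.choice(ua_list)}   # rotate the UA each attempt
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        except requests.RequestException:
            time.sleep(10)
            proxies = {'http': random.choice(ip_list).strip()}   # rotate the proxy
    return None   # every attempt failed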
#!/usr/bin/python3
# The simplest countermeasure of all: pause between requests to stay under the
# site's rate limit.
import time

time.sleep(5)   # block for 5 seconds
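A fixed interval is itself a detectable pattern; a randomized delay (a small sketch of my own) looks more like a human visitor:

import random
import time

time.sleep(random.uniform(2, 8))   # pause 2-8 seconds, different on every request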
# For a site without an anti-crawling mechanism (pmd5.com), the mechanize library
# can simply emulate browser behaviour. First capture the site's request headers,
# e.g. with BurpSuite.
# _*_ coding:utf-8 _*_
from __future__ import print_function
import re

import mechanize

flagmd5 = '762306AB890905CFF6887D5A75776382'


def web_md5(md5_string):
    br = mechanize.Browser()
    # whether to honour HTML http-equiv headers
    br.set_handle_equiv(True)
    # whether to follow redirects
    br.set_handle_redirect(True)
    # whether to add a Referer header to every request
    br.set_handle_referer(True)
    # do not obey the rules in robots.txt
    br.set_handle_robots(False)
    # do not handle gzip transfer encoding
    br.set_handle_gzip(False)
    # follow HTTP refresh redirects, waiting at most 1 s
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # set the browser's header info
    br.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36')]

    br.open('http://pmd5.com/')
    # for form in br.forms():
    #     print(form)    # uncomment to inspect the forms; unnecessary once the
    #                    # form structure is known
    br.select_form(name="formMd5")
    br.form['key'] = md5_string
    br.submit()
    page = br.response().read()

    pattern = "<em>.{4}</em>”!</p></div>"   # the recovered plaintext sits inside the <em> tag
    flag = re.findall(pattern, page, flags=0)
    print(flag)
    if flag:
        print(flag[0][4:8])   # strip the surrounding <em>...</em> markup
    print(page)


web_md5(flagmd5)
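One note on why mechanize is convenient here: br.select_form() parses the whole form, so hidden inputs (session tokens, view state and the like) are carried along in the submit automatically, and only the visible 'key' field has to be filled in by hand.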
pip install Selenium
Download the matching browser-driver package and add its location to the PATH environment variable.
Recent versions of Selenium no longer support PhantomJS; use a headless browser such as Chrome instead.
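With PhantomJS gone, headless Chrome is the usual replacement. A minimal sketch, assuming a Selenium 4-style API and a chromedriver reachable on the PATH:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')          # run Chrome without a window
driver = webdriver.Chrome(options=opts)  # assumes chromedriver is on the PATH
driver.get('http://icanhazip.com')
print(driver.page_source)                # the rendered HTML, JS already executed
driver.quit()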
https://blog.csdn.net/u010358168/article/details/79749149
https://blog.csdn.net/qq_30242609/article/details/79323963