# Match by CSS class
month = soup.find_all('li', {"class": "month"})
for m in month:
    print(m.get_text())
一月
二月
三月
四月
五月
jan = soup.find('ul', {'class': 'jan'})
d_jan = jan.find_all('li')
for d in d_jan:
    print(d.get_text())
一月一号
一月二号
一月三号
Parsing web pages with BeautifulSoup: regular expressions
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
html = urlopen(
    "https://morvanzhou.github.io/static/scraping/table.html"
).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
img_links = soup.find_all('img', {'src': re.compile(r'.*?\.jpg')})
for link in img_links:
    print(link['src'])
Mini exercise: randomly crawl Baidu Baike
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import random
# set the start page and keep the /item/... sub-URLs in 'his' as a record of the pages we have visited
base_url = "https://baike.baidu.com"
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"]
# select the last sub url in 'his', print the title and url
url = base_url + his[-1]
html = urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
print(soup.find('h1').get_text(), ' url: ', his[-1])
# find valid urls
sub_urls = soup.find_all('a', {'target': '_blank', 'href': re.compile('/item/(%.{2})+$')})
if len(sub_urls) != 0:
    his.append(random.sample(sub_urls, 1)[0]['href'])
else:
    # no valid sub link found
    his.pop()
print(his)
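The snippet above takes only a single random step. To keep wandering through Baidu Baike, the same logic can be wrapped in a loop; a minimal sketch reusing base_url and his from above (20 iterations is an arbitrary cap, not from the original code):
for i in range(20):
    url = base_url + his[-1]
    html = urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')
    print(i, soup.find('h1').get_text(), ' url: ', his[-1])
    # collect candidate /item/... links and hop to a random one, or step back if none exist
    sub_urls = soup.find_all('a', {'target': '_blank', 'href': re.compile('/item/(%.{2})+$')})
    if len(sub_urls) != 0:
        his.append(random.sample(sub_urls, 1)[0]['href'])
    else:
        his.pop()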
Downloading files
import os
os.makedirs('./images/', exist_ok=True)   # create a folder for the downloads
IMAGE_URL='https://morvanzhou.github.io/static/img/description/learning_step_flowchart.png'
Using urlretrieve
from urllib.request import urlretrieve   # urllib provides a download function, urlretrieve
urlretrieve(IMAGE_URL, './images/Image1.png')
('./images/Image1.png', <http.client.HTTPMessage at 0x7fb8c834c0f0>)
Using requests
import requests
r = requests.get(IMAGE_URL)
with open('./images/Image2.png', 'wb') as f:
    f.write(r.content)
r = requests.get(IMAGE_URL, stream=True)   # stream loading
with open('./images/Image3.png', 'wb') as f:
    for chunk in r.iter_content(chunk_size=32):
        f.write(chunk)
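As a side note (not from the original), a streamed response can also be written without an explicit chunk loop by copying the raw file object with shutil; a minimal sketch assuming the same IMAGE_URL:
import shutil
import requests
r = requests.get(IMAGE_URL, stream=True)
with open('./images/Image4.png', 'wb') as f:
    shutil.copyfileobj(r.raw, f)   # stream the undecoded body straight to disk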
Mini exercise: download some nice images
from bs4 import BeautifulSoup
import requests
URL = "http://www.nationalgeographic.com.cn/animals/"
html = requests.get(URL).text
soup = BeautifulSoup(html, 'lxml')
img_ul = soup.find_all('ul', {"class": "img_list"})
for ul in img_ul:
    imgs = ul.find_all('img')
    for img in imgs:
        url = img['src']
        r = requests.get(url, stream=True)
        image_name = url.split('/')[-1]
        with open('./images/%s' % image_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=128):
                f.write(chunk)
        print('Saved %s' % image_name)
Speeding up the crawler: multiprocessing (distributed) crawling
import multiprocessing as mp
import time
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import re
base_url = 'https://morvanzhou.github.io/'

def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)   # slight delay for downloading
    return response.read().decode()

def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('a', {'href': re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': 'og:url'})['content']
    return title, page_urls, url
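Before timing the loops below, a quick single-page smoke test can confirm that crawl and parse work together (a minimal sketch, not part of the original timing runs):
html = crawl(base_url)
title, page_urls, url = parse(html)
print(title, url, len(page_urls))   # page title, canonical url, number of sub-links found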
Testing the plain (sequential) crawl
unseen = set([base_url,])
seen = set()
if base_url != "http://127.0.0.1:4000/":
    restricted_crawl = True
else:
    restricted_crawl = False
count, t1 = 1, time.time()
while len(unseen) != 0:   # still have some urls to visit
    if restricted_crawl and len(seen) > 20:
        break
    print('\nDistributed Crawling...')
    htmls = [crawl(url) for url in unseen]
    print('\nDistributed Parsing...')
    results = [parse(html) for html in htmls]
    print('\nAnalysing...')
    seen.update(unseen)   # mark the crawled urls as seen
    unseen.clear()        # nothing left unseen
    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)   # collect new urls to crawl
print('Total time: %.1f s' % (time.time()-t1, ))   # 53 s
Testing the multiprocessing (distributed) crawl
unseen = set([base_url,])
seen = set()
pool = mp.Pool(4)
count, t1 = 1, time.time()
while len(unseen) != 0:   # still have some urls to visit
    if restricted_crawl and len(seen) > 20:
        break
    print('\nDistributed Crawling...')
    crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
    htmls = [j.get() for j in crawl_jobs]   # request connection
    print('\nDistributed Parsing...')
    parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
    results = [j.get() for j in parse_jobs]   # parse html
    print('\nAnalysing...')
    seen.update(unseen)   # mark the crawled urls as seen
    unseen.clear()        # nothing left unseen
    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)   # collect new urls to crawl
print('Total time: %.1f s' % (time.time()-t1, ))   # 16 s !!!
# asyncio: asynchronous execution in a single thread; downloading a page and processing it
# no longer have to happen back to back, so the time spent waiting for downloads is put to better use
import time

def job(t):
    print('Start job', t)
    time.sleep(t)
    print('Job', t, 'takes', t, 's')

def main():
    [job(t) for t in range(1, 3)]

t1 = time.time()
main()
print('No async total time: ', time.time()-t1)
Start job 1
Job 1 takes 1 s
Start job 2
Job 2 takes 2 s
No async total time: 3.00662899017334
import asyncio

async def job(t):   # the async version of the job
    print('Start job', t)
    await asyncio.sleep(t)   # wait t seconds; switch to other tasks in the meantime
    print('Job', t, 'takes', t, 's')

async def main(loop):   # also an async coroutine
    tasks = [loop.create_task(job(t)) for t in range(1, 3)]   # create the tasks without running them yet
    await asyncio.wait(tasks)   # run them and wait until all tasks finish

t1 = time.time()
loop = asyncio.get_event_loop()      # set up the event loop
loop.run_until_complete(main(loop))  # run main in the loop
print("Async total time:", time.time()-t1)
Start job 1
Start job 2
Job 1 takes 1 s
Job 2 takes 2 s
Async total time: 2.0041818618774414
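For reference (not in the original): on Python 3.7+ the manual loop management can be replaced by asyncio.run and asyncio.gather; a minimal sketch reusing the job coroutine above (inside a notebook, where a loop is already running, this form may not work):
async def main():
    await asyncio.gather(*(job(t) for t in range(1, 3)))   # schedule and await both jobs

t1 = time.time()
asyncio.run(main())   # creates, runs and closes a fresh event loop
print("Async total time:", time.time() - t1)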
aiohttp
$ pip3 install aiohttp
import requests

URL = 'https://morvanzhou.github.io/'

def normal():
    for i in range(2):
        r = requests.get(URL)
        url = r.url
        print(url)

t1 = time.time()
normal()
print('Normal total time:', time.time()-t1)
https://morvanzhou.github.io/
https://morvanzhou.github.io/
Normal total time: 0.6391615867614746
import aiohttp

async def job(session):
    response = await session.get(URL)   # wait here and switch to other tasks in the meantime
    return str(response.url)

async def main(loop):
    async with aiohttp.ClientSession() as session:   # the session form recommended by the official docs
        tasks = [loop.create_task(job(session)) for _ in range(2)]
        finished, unfinished = await asyncio.wait(tasks)
        all_results = [r.result() for r in finished]   # collect all results
        print(all_results)

t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
loop.close()
print('Async total time:', time.time()-t1)
['https://morvanzhou.github.io/', 'https://morvanzhou.github.io/']
Async total time: 0.30881452560424805
Comparison with the multiprocessing distributed crawler
import aiohttp
import asyncio
import time
from bs4 import BeautifulSoup
from urllib.request import urljoin
import re
import multiprocessing as mp

base_url = "https://morvanzhou.github.io/"
# DON'T OVER CRAWL THE WEBSITE OR YOU MAY NEVER VISIT AGAIN
if base_url != "http://127.0.0.1:4000/":
    restricted_crawl = True
else:
    restricted_crawl = False

seen = set()
unseen = set([base_url])
def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url

async def crawl(url, session):
    r = await session.get(url)
    html = await r.text()
    await asyncio.sleep(0.1)   # slight delay for downloading
    return html
async def main(loop):
    pool = mp.Pool(8)   # the pool size only slightly affects the speed here
    async with aiohttp.ClientSession() as session:
        count = 1
        while len(unseen) != 0:
            if restricted_crawl and len(seen) > 20:   # stop early so we don't over-crawl the site
                break
            print('\nAsync Crawling...')
            tasks = [loop.create_task(crawl(url, session)) for url in unseen]
            finished, unfinished = await asyncio.wait(tasks)
            htmls = [f.result() for f in finished]
            print('\nDistributed Parsing...')
            parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
            results = [j.get() for j in parse_jobs]
            print('\nAnalysing...')
            seen.update(unseen)
            unseen.clear()
            for title, page_urls, url in results:
                # print(count, title, url)
                unseen.update(page_urls - seen)
                count += 1

if __name__ == "__main__":
    t1 = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    # loop.close()
    print("Async total time: ", time.time() - t1)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-42-e494cc65d4bf> in <module>()
61 t1 = time.time()
62 loop = asyncio.get_event_loop()
---> 63 loop.run_until_complete(main(loop))
64 loop.close()
65 print("Async total time: ", time.time() - t1)
~/anaconda2/envs/python35/lib/python3.5/asyncio/base_events.py in run_until_complete(self, future)
441 Return the Future's result, or raise its exception.
442 """
--> 443 self._check_closed()
444
445 new_task = not futures.isfuture(future)
~/anaconda2/envs/python35/lib/python3.5/asyncio/base_events.py in _check_closed(self)
355 def _check_closed(self):
356 if self._closed:
--> 357 raise RuntimeError('Event loop is closed')
358
359 def _asyncgen_finalizer_hook(self, agen):
RuntimeError: Event loop is closed
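This RuntimeError typically appears when the cell is re-run after the event loop has already been closed (the earlier aiohttp example called loop.close()), so asyncio.get_event_loop() hands back a closed loop. A minimal sketch of one workaround, reusing the main(loop) coroutine above, is to create and install a fresh loop first:
loop = asyncio.new_event_loop()   # create a fresh loop instead of reusing the closed one
asyncio.set_event_loop(loop)      # make it the current loop for this thread
t1 = time.time()
loop.run_until_complete(main(loop))
print("Async total time: ", time.time() - t1)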
For comparison, here is the multiprocessing-only crawler again:
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import multiprocessing as mp
import re
import time

def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)   # slight delay for downloading
    return response.read().decode()

def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url
if __name__ == '__main__':
    base_url = 'https://morvanzhou.github.io/'
    # base_url = "http://127.0.0.1:4000/"
    # DON'T OVER CRAWL THE WEBSITE OR YOU MAY NEVER VISIT AGAIN
    if base_url != "http://127.0.0.1:4000/":
        restricted_crawl = True
    else:
        restricted_crawl = False

    unseen = set([base_url,])
    seen = set()
    pool = mp.Pool(8)   # the pool size strongly affects the speed
    count, t1 = 1, time.time()
    while len(unseen) != 0:   # still have some urls to visit
        if restricted_crawl and len(seen) > 20:
            break
        print('\nDistributed Crawling...')
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]   # request connection
        htmls = [h for h in htmls if h is not None]   # remove None
        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]   # parse html
        print('\nAnalysing...')
        seen.update(unseen)
        unseen.clear()
        for title, page_urls, url in results:
            # print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)
    print('Total time: %.1f s' % (time.time()-t1, ))
Scrapy
import scrapy

class MofanSpider(scrapy.Spider):
    name = "mofan"
    start_urls = [
        'https://morvanzhou.github.io/',
    ]
    # unseen = set()
    # seen = set()
    # we don't need these two, as scrapy deals with them automatically

    def parse(self, response):
        yield {   # return some results
            'title': response.css('h1::text').extract_first(default='Missing').strip().replace('"', ""),
            'url': response.url,
        }
        urls = response.css('a::attr(href)').re(r'^/.+?/$')   # find all sub urls
        for url in urls:
            yield response.follow(url, callback=self.parse)   # scrapy filters out duplicates automatically

# lastly, run this in a terminal:
# scrapy runspider 5-2-scrapy.py -o res.json
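For reference (not in the original), the spider can also be started from a plain Python script instead of the command line, assuming a reasonably recent Scrapy (2.1+ for the FEEDS setting); a minimal sketch that mirrors the -o res.json option above:
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "FEEDS": {"res.json": {"format": "json"}},   # write the yielded items to res.json
})
process.crawl(MofanSpider)
process.start()   # blocks until the crawl finishes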