近期听课的思考
近期有机会听了天善智能的课程《自然语言处理之AI深度学习顶级实战课程》,慢慢地有了一些心得,以后有机会再逐步给大家分享出来。
为什么微软称NLP 为人工智能“皇冠上的明珠”?----认知智能
深度学习在自然语言处理的通用步骤
- 论文的阅读,最新算法的研究
- 算法大致方向的评估、训练和确定
- 训练数据的收集,清洗以及数据的预处理
- 算法实现,系统设计,参数调优,模型升级
- 模型效果评估与部署
语料库的记录
其实对于很多公司来说,要做NLP的一个最大的问题就是语料库的积累,包括词向量,知识库等等。这些东西最好的来源是什么呢?–爬虫。
爬虫最常用的三种手段:
1.urllib.request
构造页面post 请求
2.scrapy
如果有非常详细的 网站树形结构,使用该框架爬取非常快捷方便
3.selenium
自动化测试利器,针对动态请求,url没有变化的网站类型有奇特疗效
以下分别针对上述三种爬取方式给出实例代码
3种爬虫
urllib.request + BeautifulSoup
主要思路,遍历分页列表–>获取每一页的博客链接–>依次爬取博客内容
'''
@author: season
@contact:
@file: spider_for_csdn.py
@time: 2018/10/16 21:32
@desc:
'''
import io
import os
import sys
import urllib
from urllib.request import urlopen
from urllib import request
from bs4 import BeautifulSoup
import datetime
import random
import re
import requests
import socket
# Default timeout, in seconds, applied to every socket created from now on.
# NOTE(review): 5000 s is about 83 minutes -- presumably 5 s (or a value in
# milliseconds) was intended; confirm before changing.
socket.setdefaulttimeout(5000)
# Re-wrap stdout so that printed text is encoded as gb18030
# (presumably for a Chinese-locale Windows console -- confirm).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# Three candidate request-header dicts, each with a different browser User-Agent.
headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
headers2 = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
headers3 = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
# Article hrefs collected so far; filled in by getArticleLinks().
articles = set()
def getArticleLinks(pageUrl):
    """Collect the article links of one blog list page into ``articles``.

    The page at ``pageUrl`` is downloaded and parsed; for every ``<h4>``
    element whose first ``<a>`` child carries an ``href`` attribute, that
    href is added to the module-level ``articles`` set.

    Args:
        pageUrl: URL of the list page to download.

    Returns:
        None. ``articles`` is updated in place.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` if the download
            fails.
    """
    # NOTE(review): ProxyHandler keys are URL schemes ('http', 'https').
    # 'post' is not a scheme, so this proxy is presumably never applied.
    # Left untouched because enabling it would reroute all traffic -- confirm.
    proxy_handler = urllib.request.ProxyHandler({'post': '49.51.195.24:1080'})
    opener = urllib.request.build_opener(urllib.request.HTTPHandler, proxy_handler)
    urllib.request.install_opener(opener)

    # `headers1 or headers2 or headers3` always evaluated to headers1 (a
    # non-empty dict is truthy); pick one at random so the User-Agent rotates.
    chosen_headers = random.choice((headers1, headers2, headers3))
    req = request.Request(pageUrl, headers=chosen_headers)

    # Close the HTTP response as soon as the body has been parsed.
    with urlopen(req) as html:
        bsObj = BeautifulSoup(html.read(), "html.parser")

    for heading in bsObj.findAll("h4"):
        anchor = heading.a
        if anchor is None:
            # An <h4> without a link used to raise AttributeError.
            continue
        if 'href' in anchor.attrs:
            # A set ignores duplicates, so no membership test is needed.
            articles.add(anchor.attrs["href"])
# Pagination hrefs that getPageLinks() has already followed.
pages = set()


def getPageLinks(bokezhuye):
    """Collect article links from a list page and from the pages it links to.

    The page at ``bokezhuye`` is downloaded, its own article links are
    gathered through ``getArticleLinks``, and every pagination link of the
    form ``/<user>/article/list[/<n>]`` that has not been seen before is
    recorded in ``pages`` and passed to ``getArticleLinks`` as well.

    The earlier version also looped over ``articles`` here and called
    ``getArticleText``, a name that is not defined in this file, on URLs
    that received a second ``http://blog.csdn.net/`` prefix. That loop
    could only fail, so it has been dropped.

    Args:
        bokezhuye: URL of the blog list page to start from.

    Returns:
        None. ``pages`` and ``articles`` are updated in place.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` if a download
            fails.
    """
    from urllib.parse import urljoin

    # NOTE(review): ProxyHandler keys are URL schemes ('http', 'https').
    # 'post' is not a scheme, so this proxy is presumably never applied.
    # Left untouched because enabling it would reroute all traffic -- confirm.
    proxy_handler = urllib.request.ProxyHandler({'post': '49.51.195.24:1080'})
    opener = urllib.request.build_opener(urllib.request.HTTPHandler, proxy_handler)
    urllib.request.install_opener(opener)

    # `headers1 or headers2 or headers3` always evaluated to headers1;
    # choose randomly so the User-Agent actually rotates.
    chosen_headers = random.choice((headers1, headers2, headers3))
    req = request.Request(bokezhuye, headers=chosen_headers)
    with urlopen(req) as html:
        bsObj = BeautifulSoup(html.read(), "html.parser")

    # Articles listed on the start page itself.
    getArticleLinks(bokezhuye)

    page_href = re.compile("^/([A-Za-z0-9]+)(/article)(/list)(/[0-9]+)*$")
    for pagelist in bsObj.findAll("a", href=page_href):
        newPage = pagelist.attrs["href"]
        if newPage in pages:
            continue
        pages.add(newPage)
        # The href starts with '/', so plain concatenation produced a
        # double slash; urljoin builds the well-formed absolute URL.
        getArticleLinks(urljoin("http://blog.csdn.net/", newPage))
# Root URL of the blog being crawled and the prefix of its paginated lists.
str_page_url_prefix = 'https://blog.csdn.net/wangyaninglm/'
list_page_str = str_page_url_prefix + 'article/list/'

# Visit list pages 1..17; each call adds that page's links to `articles`.
# NOTE(review): 17 is hard-coded -- presumably the number of list pages the
# blog had when this was written; confirm before reuse.
for i in range(1, 18):
    getPageLinks(list_page_str + str(i))

# Keep only links that point at an article detail page of this blog.
# The prefix is escaped so that its dots match literally instead of
# matching any character.
page_url_pattern = "(" + re.escape(str_page_url_prefix) + "article/details)(/[0-9]+)*$"
page_url_list = [
    page_link for page_link in articles
    if re.match(page_url_pattern, page_link)
]
print(len(page_url_list))

# Not used in the visible code; kept because other code may rely on them.
dict_page_content = {'title': '', 'content': ''}
list_page_content = []

# Helper module that retries with more browser headers to avoid HTTP 403.
import spider_for_403

for url in page_url_list:
    spider_for_403.get_Content(url, 'blog-content-box', 'title-article', 'article_content')
在爬取的过程中发现403报错,于是写了下面这个文件(spider_for_403.py),加入了更多的浏览器请求头,每次请求随机选用一个
import urllib
import urllib.request
import random
from bs4 import BeautifulSoup
import urllib.error
# Pool of browser User-Agent strings; get_Content() picks one at random for
# each request.
my_headers = [
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
]
import re
# Translation table for validateTitle(): the characters / \ : * ? " < > |
# (not allowed in Windows file names) become "_"; CR, LF and TAB are deleted.
_TITLE_TRANSLATION = str.maketrans({
    **dict.fromkeys('/\\:*?"<>|', "_"),
    **dict.fromkeys("\r\n\t"),
})


def validateTitle(title):
    """Return ``title`` made safe for use as a file name.

    Each of the characters ``/ \\ : * ? " < > |`` is replaced by an
    underscore, and carriage returns, line feeds and tabs are removed.
    All other characters are returned unchanged.

    Args:
        title: The raw title text.

    Returns:
        The sanitised title as a new string.
    """
    # One pass over the string instead of a regex plus three .replace() calls.
    return title.translate(_TITLE_TRANSLATION)
def get_Content(url,contend_box_id,title_id,contend_id):
    """Download one article page and save its text as ``blog/<title>.txt``.

    The page is requested with a User-Agent picked at random from
    ``my_headers``. The text of the first ``<h1>`` with class ``title_id``
    becomes the file name (sanitised by ``validateTitle``). The text of
    every ``<div id=contend_id>`` found inside a
    ``<div class=contend_box_id>`` is written to that file.

    Network and file errors are printed instead of raised, so a crawl over
    many URLs keeps going after a failure.

    Args:
        url: Address of the article page.
        contend_box_id: CSS class of the ``<div>`` wrapping the article.
        title_id: CSS class of the ``<h1>`` holding the title.
        contend_id: ``id`` of the ``<div>`` holding the article body.

    Returns:
        None.
    """
    import os

    req = urllib.request.Request(url)
    req.add_header("User-Agent", random.choice(my_headers))
    # The former add_header("GET", url) sent a meaningless header named
    # "GET"; the request method is already GET, so it was dropped.

    try:
        with urllib.request.urlopen(req) as response:
            bsObj = BeautifulSoup(response.read(), "html.parser")

        title = bsObj.findAll(name='h1', attrs={'class': title_id})
        if not title:
            # title[0] used to raise an uncaught IndexError and stop the crawl.
            print('title not found: ' + url)
            return
        str_title = validateTitle(title[0].get_text() + '.txt')
        # errors='replace' keeps titles with characters outside GBK from
        # raising UnicodeEncodeError, which no handler below would catch.
        print(str_title.encode('gbk', errors='replace'))

        # Create the output directory on first use instead of failing.
        os.makedirs('blog', exist_ok=True)
        with open(os.path.join('blog', str_title), 'w', encoding='utf-8') as f_blog:
            for content_box in bsObj.findAll(name='div', attrs={'class': contend_box_id}):
                # Search inside the box; searching the whole document wrote
                # the same body once per box.
                for contend in content_box.findAll(name='div', id=contend_id):
                    f_blog.write('content' + '\n' + contend.get_text() + '\n')
    except urllib.error.URLError as e:
        # Must come before OSError: URLError is a subclass of OSError, so
        # this handler was unreachable in the previous order.
        print(e.reason)
    except OSError as e:
        print(e)
效果:

scrapy 与xpath
在pycharm 中调试 scrapy
# Start the 'Hospital' spider from a plain Python script so that it can be run
# under the PyCharm debugger: the usual command line is split into an argv
# list and handed to Scrapy's command-line entry point.
from scrapy import cmdline
cmdline.execute('scrapy crawl Hospital'.split())
写好spider 的解析函数
class HospitalSpider(Spider):
    """Spider that collects hospital introduction pages from yyk.99.com.cn.

    Crawl order: the city index page, then each link found under
    ``div.m-clump``, then each link found in ``div.m-table-2``, whose
    ``jianjie.html`` page is parsed into one ``HospitalspiderItem``.
    """

    # Not read anywhere inside this class; kept in case other code uses it.
    i = 2
    name = 'Hospital'
    # Browser-like headers, sent with every request this spider makes.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }
    # File name appended to a hospital URL to reach its introduction page.
    jianjie = 'jianjie.html'
    base_url = 'https://yyk.99.com.cn'

    def _absolute(self, href):
        """Resolve ``href`` against ``base_url`` into an absolute URL."""
        from urllib.parse import urljoin

        # For a root-relative href this equals base_url + href; unlike plain
        # concatenation it also copes with absolute or slash-less hrefs.
        return urljoin(self.base_url, href)

    def start_requests(self):
        """Yield the request for the city index page."""
        url = 'https://yyk.99.com.cn/city.html'
        yield Request(url, headers=self.headers)

    def parse(self, response):
        """Follow the first 31 links listed under ``div.m-clump``.

        NOTE(review): the limit of 31 presumably matches the number of
        regions shown on the index page -- confirm.
        """
        hospitals_sub_url = response.xpath(
            '//div[@class="m-clump"]//dt/a[@href]/@href').extract()[:31]
        for href in hospitals_sub_url:
            # Pass the headers along: follow-up requests used to fall back to
            # Scrapy's default User-Agent, unlike the first request.
            yield Request(self._absolute(href), headers=self.headers,
                          callback=self.parse_dir_urls)

    def parse_dir_urls(self, response):
        """Follow every link in ``div.m-table-2`` to its introduction page."""
        hospitals_sub_url = response.xpath(
            '//div[@class="m-table-2"]//td/a[@href]/@href').extract()
        for href in hospitals_sub_url:
            yield Request(self._absolute(href + self.jianjie), headers=self.headers,
                          callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Extract update date, region and introduction into one item."""
        item = HospitalspiderItem()
        item.item_dict['更新日期'] = response.xpath('//div[@class="crumb"]//font/text()').extract()
        item.item_dict['所在地区'] = response.xpath('//table[@class="present-table"]//tr[1]/td[4]/a/text()').extract()
        # Extracted as raw HTML, not text().
        item.item_dict['简介'] = response.xpath('//div[@class="present-wrap1"]//div[@id="txtintro"]').extract()
        yield item
pipeline 对依次爬取到的 item 进行处理,此处参照 item 类把数据写成 csv 文件,完成数据持久化
pipeline
import re
from HospitalSpider import items
class HospitalspiderPipeline(object):
    """Item pipeline that writes every scraped item as a row of ``hospital.csv``.

    The header row and the column order both come from the item's
    ``item_list``; the cell values come from ``item_dict``.
    """

    # Template item; only its ``item_list`` is read, to build the header row.
    csv_head = items.HospitalspiderItem()

    def clean_html(self,str):
        """Return ``str`` with every ``<...>`` tag removed."""
        reg = re.compile('<[^>]*>')
        return reg.sub('', str)

    def add_yinhao(self, str):
        """Wrap ``str`` in double quotes; an empty value yields ``''``.

        Double quotes inside the value are doubled, as CSV requires, so a
        quote in the text can no longer end the cell early.
        """
        if not str:
            return ''
        return '"' + str.replace('"', '""') + '"'

    def write_csv_line(self, item):
        """Build the CSV row for ``item``, terminated by a newline.

        One cell is produced per entry of ``item.item_list``, in order. The
        cell holds the first value of ``item.item_dict[key]``, stripped of
        HTML tags and quoted. A key with no values yields an empty cell, so
        columns stay aligned with the header; such cells used to be skipped,
        which shifted every later column to the left.

        CR, LF, TAB and space characters are removed from the whole row.
        """
        cells = []
        for key in item.item_list:
            values = item.item_dict[key]
            if values:
                cells.append(self.add_yinhao(self.clean_html(str(values[0]))))
            else:
                cells.append('')
        row = ','.join(cells)
        return row.translate(str.maketrans('', '', '\r\n\t ')) + '\n'

    def __init__(self):
        pass

    def open_spider(self, spider):
        """Open ``hospital.csv`` for writing and emit the header row."""
        self.file = open('hospital.csv', 'w', encoding='utf-8')
        header = ','.join('"' + column + '"' for column in self.csv_head.item_list)
        self.file.write(header + '\n')

    def process_item(self, item, spider):
        """Append ``item`` to the CSV file and pass it on.

        Returns:
            The same ``item``. Scrapy hands the return value to the next
            pipeline, so returning nothing would feed ``None`` onward.
        """
        self.file.write(self.write_csv_line(item))
        return item

    def close_spider(self, spider):
        """Close the CSV file when the spider finishes."""
        self.file.close()
修改 settings.py 文件
# Enable the CSV pipeline. The integer is its order value: when several
# pipelines are enabled, Scrapy runs them from the lowest value to the highest.
ITEM_PIPELINES = {
'HospitalSpider.pipelines.HospitalspiderPipeline': 300,
}
使用selenium 模拟浏览器行为
'''
@author: season
@contact:
@file: main.py
@time: 2018/11/16 14:24
@desc:
'''
import selenium
from selenium import webdriver
import file_operator
def get_Page_all_detail(handle_web_driver,str_xpath):
    """Save the detail page behind each not-yet-saved entry of the current page.

    The text of every element matching ``str_xpath`` is collected. That list
    is passed through ``file_operator.sub_list`` together with the names of
    the ``.html`` files already in ``./html/`` (presumably this drops entries
    that were saved earlier -- confirm against ``file_operator``). For each
    remaining text, the link with that text is clicked, the resulting page
    source is written to ``./html/<text>.html``, and the browser goes back.

    Args:
        handle_web_driver: The Selenium WebDriver showing the result page.
        str_xpath: XPath selecting the elements whose text names the links.

    Returns:
        None.
    """
    html_dir = r'./html/'

    texts_on_page = [
        cell.text for cell in handle_web_driver.find_elements_by_xpath(str_xpath)
    ]
    saved_names = file_operator.all_pure_file_name_without_extension(html_dir, '.html')
    pending = file_operator.sub_list(saved_names, texts_on_page)

    for link_text in pending:
        detail_link = handle_web_driver.find_element_by_link_text(link_text)
        detail_link.click()
        handle_web_driver.implicitly_wait(1)

        target_path = html_dir + link_text + '.html'
        with open(target_path, 'w', encoding='utf-8') as out_file:
            out_file.write(handle_web_driver.page_source)

        # Return to the result list before handling the next entry.
        handle_web_driver.back()
def send_click(url):
    """Open ``url`` in Chrome and save the detail pages of every result page.

    The number read from the pager element is used as the loop count
    (presumably the total number of result pages -- confirm). For each
    iteration the entries of the current page are saved through
    ``get_Page_all_detail`` and the "next page" button is clicked.

    Args:
        url: Address of the search result page to start from.

    Returns:
        None.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        browser.implicitly_wait(1)
        str_xpath = '//tr[contains(@style, " color:#535353")]/td[2]'
        next_page_element_number = int(browser.find_element_by_xpath('// *[ @ id = "searchfrm"] / div / div[4] / div[1]/a[3]').text)
        for _ in range(next_page_element_number):
            get_Page_all_detail(browser, str_xpath)
            next_button = browser.find_element_by_xpath('//input[contains(@class, "page_next ui-button ui-widget ui-state-default ui-corner-all")]')
            next_button.click()
    finally:
        # Always shut the browser down; it used to stay open after the run
        # and after any error.
        browser.quit()
def main():
    """Build the search URL from its base and the search term, then crawl it."""
    search_term = '***'
    send_click('http://www.search.keywords=' + search_term)


# Run the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()
新的改变
我还没写完程序,后续代码和过程会逐步贴上来
主要计划是:使用我自己的博客作为语料,进行词云、TF-IDF、TextRank 等算法的分析
相关链接
NLP系列文章:
NLP 系列文章代码
NLP 下载资源
未完待续 |