目标:使用Python编写爬虫,获取链家青岛站的房产信息,然后对爬取的房产信息进行分析。
环境:win10+python3.8+pycharm
Python库:
import requests
import bs4
from bs4 import BeautifulSoup
import lxml
import re
import xlrd
import xlwt
import xlutils.copy
import time
目标分析:
1、编写爬虫爬取链家青岛站的房产信息
①分析目标链接
第一页:https://qd.fang.lianjia.com/loupan/pg1
第二页:https://qd.fang.lianjia.com/loupan/pg2
由上面的链接可以看出来,不同网页是使用最后的pgx来进行变化的
所以将链接分为两部分,使用字符串拼接获得所有的房产网页链接
# Build the full list of listing-page URLs: the site pages via a
# trailing "pg<N>" suffix, so each address is the base URL plus a number.
WebDiZhi = [Url + str(page) for page in range(1, 85)]
使用遍历获得所有的链接并保存为列表
②分析网页结构
# Download every listing page and narrow its html down to the blocks
# that hold the per-estate details.
Xu = 0
Shuliang = len(WebDiZhi)
while Xu in range(Shuliang):  # walk the whole URL list

    Web = requests.get(WebDiZhi[Xu])
    WebText = Web.text

    # Step 1: coarse filter - keep only the estate-list wrapper,
    # discarding most of the page's irrelevant markup.
    soup_One = BeautifulSoup(WebText, 'html.parser')
    XinXi_One = soup_One.find_all(class_="resblock-list-wrapper")

    # Step 2: fine filter - keep only the per-estate description blocks.
    soup_Two = BeautifulSoup(str(XinXi_One), 'lxml')
    XinXi_Two = soup_Two.find_all(class_="resblock-desc-wrapper")
通过两步简单的筛选将房产信息所对应的html代码筛选出来
方便进一步分析html网页标签获取不同的房产信息
③针对不同的房产信息定义不同的函数,通过调用函数来获取不同的房产信息并保存到目标文件中
print("-----------------开始写入第{}页-------------".format(Xu))
# (extractor, column index, field label) for every field written out;
# the loop replays the original call sequence exactly.
for JieXi, Lie, BiaoQian in (
        (GetName, 0, '名称'),            # estate name
        (NatureHouse, 1, '性质'),        # residential / commercial
        (StatusHouse, 2, '状态'),        # sale status (e.g. 在售)
        (AddressHouse, 3, '地址'),       # address
        (AreaHouse, 4, '面积'),          # floor area
        (AveragePriceHouse, 5, '均价'),  # average price
        (TotalPriceHouse, 6, '总价'),    # total price
):
    Write_File(JieXi(XinXi_Two), Lie, Xu)
    print("---------小区{}写入成功---------".format(BiaoQian))
    time.sleep(3)  # throttle so the site is not hammered
各房产信息函数
def Write_File(Data, lei, Hang):
    """Write one field's values into workbook column *lei*.

    @param Data: list of cell values for a single field
    @param lei:  zero-based column index to write into
    @param Hang: page number; every page occupies a band of 10 rows
    """
    # NOTE(review): the path looks like it lost its backslashes in
    # transcription (probably F:\实例\...\111.xls) - confirm before running.
    data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls")
    ws = xlutils.copy.copy(data)
    table = ws.get_sheet(0)
    Shu = Hang * 10  # row offset: assumes exactly 10 listings per page
    for i in range(len(Data)):
        table.write(i + 1 + Shu, lei, Data[i])
        print("----第{}项写入成功----".format(i))
    ws.save(r"F:实例Python实例爬虫111.xls")


def GetName(XinXi):
    """
    @param XinXi: html fragments filtered out in step two of GetHTML
    @return: estate names, list type
    """
    Name_list = []  # fixed original 'Nmae_list' typo (local name only)
    Obtain_Name = BeautifulSoup(str(XinXi), 'lxml')
    for i in Obtain_Name.find_all(class_="name"):
        Name_list.append(BeautifulSoup(str(i), 'lxml').string)
    return Name_list


# The remaining extractors each parse one field out of the same
# step-two html fragments; some fields of the page are not covered.

def NatureHouse(Nature):
    """Return the property type (residential / commercial) per estate."""
    Nature_list = []
    Obtain_Nature = BeautifulSoup(str(Nature), 'lxml')
    for i in Obtain_Nature.find_all(class_='resblock-type'):
        Nature_list.append(BeautifulSoup(str(i), 'lxml').string)
    return Nature_list


def StatusHouse(Status):
    """Return the sale status (e.g. 在售) per estate."""
    Status_list = []
    Obtain_Status = BeautifulSoup(str(Status), 'lxml')
    for i in Obtain_Status.find_all(class_='sale-status'):
        Status_list.append(BeautifulSoup(str(i), 'lxml').string)
    return Status_list


def AddressHouse(Area):
    """
    @param Area: html fragments filtered out in step two of GetHTML
    @return: one combined address string per estate

    Each estate carries two <span> fragments (district + area) plus an
    <a> fragment (street address); the three parts are concatenated.
    """
    Target_Information_list = []
    Analysis_Label_One = BeautifulSoup(str(Area), 'lxml')
    Get_label_One = Analysis_Label_One.find_all(class_='resblock-location')
    Analysis_Label_Two = BeautifulSoup(str(Get_label_One), 'lxml')
    Get_label_Two = Analysis_Label_Two.find_all(name='span')

    # Strip tags, keeping only the text.  (fixed: r'<.>' only matched
    # single-character tag names, so tags such as <span> were never removed)
    Target_Information_One = []
    for i in Get_label_Two:
        Target_Information_One.append(re.sub(r'<[^>]*>', '', str(i)))

    # Merge the spans pairwise - district + area belong to one estate.
    # (fixed: the original nested while indexed one past the end and
    # raised IndexError whenever the span count was odd)
    Target_Information_Two = []
    for a in range(0, len(Target_Information_One) - 1, 2):
        Target_Information_Two.append(
            Target_Information_One[a + 1] + Target_Information_One[a])

    # Detailed street address from the <a> tags.
    Target_Information_Three = []
    for c in Analysis_Label_Two.find_all(name='a'):
        Target_Information_Three.append(re.sub(r'<[^>]*>', '', str(c)))

    # Pair each merged district string with its street address.
    A = min(len(Target_Information_Two), len(Target_Information_Three))
    for i in range(A):
        Target_Information_list.append(
            Target_Information_Two[i] + Target_Information_Three[i])

    return Target_Information_list


def AreaHouse(Area):
    """
    @param Area: html fragments filtered out in step two of GetHTML
    @return: room layout plus total floor area per estate
    """
    Area_list = []
    soup = BeautifulSoup(str(Area), 'lxml')

    # Room layout: strip tags, then collapse whitespace.  (fixed:
    # r'<.>' never matched real tags, and r's+' deleted the letter
    # 's' instead of whitespace - the backslash was lost)
    Get_Area_Two = []
    for c in soup.find_all(class_='resblock-room'):
        Get_Area_Two.append(
            re.sub(r'\s+', '', re.sub(r'<[^>]*>', '', str(c))))

    # Total floor area, cleaned the same way.
    Get_Area_Four = []
    for a in soup.find_all(class_='resblock-area'):
        Get_Area_Four.append(
            re.sub(r'\s+', '', re.sub(r'<[^>]*>', '', str(a))))

    # Join layout and area pairwise into the result list.
    A = min(len(Get_Area_Two), len(Get_Area_Four))
    for i in range(A):
        Area_list.append(Get_Area_Two[i] + Get_Area_Four[i])

    return Area_list


def AveragePriceHouse(Average):
    """Return the average price (the 'number' span) per estate."""
    Average_list = []
    Obtain_Average = BeautifulSoup(str(Average), 'lxml')
    for i in Obtain_Average.find_all(class_='number'):
        Average_list.append(BeautifulSoup(str(i), 'lxml').string)
    return Average_list


def TotalPriceHouse(Total):
    """Return the total price per estate, with the leading '总价' removed."""
    Total_list = []
    Obtain_Total = BeautifulSoup(str(Total), 'lxml')
    # fixed: the original called the non-existent method 'fjind_all',
    # which raises AttributeError on the first use
    for i in Obtain_Total.find_all(class_='second'):
        Get_Span_one = BeautifulSoup(str(i), 'lxml').string
        if Get_Span_one is not None:  # .string is None for nested tags
            Get_Span_one = Get_Span_one.lstrip('总价')
        Total_list.append(Get_Span_one)
    return Total_list
创建存储文件
def Create_File():
    """Create the output workbook with a header row.

    The sheet must be named 'Sheet1': the downstream analysis code
    opens it with sheet_by_name("Sheet1"), so the original 'shett1'
    typo made that lookup fail.
    """
    name = ['名称', '性质', '状态', '地址', '面积', '均价', '总价']
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('Sheet1', cell_overwrite_ok=True)
    for i in range(len(name)):
        sheet.write(0, i, name[i])
    workbook.save(r'F:实例Python实例爬虫111.xls')
    print("文件创建成功")
2、简单的数据分析并使用饼状图表示房产均价比例
所用到的库:
import pandas as pd
import xlrd
import re
import xlutils.copy
import matplotlib.pyplot as plt
①数据清洗----删除空值行
def ExceptNull():
    """
    Data cleaning step one: drop rows whose 面积 or 总价 cell is empty.

    Reads the scraped workbook, fills blanks with the sentinel '99999',
    drops those rows and writes the cleaned table to 111.xls.
    """
    df = pd.DataFrame(pd.read_excel(r'F:实例Python实例爬虫111.xls'))
    # 面积 column: fill blanks with a sentinel, then drop those rows.
    print(df['面积'].isnull().value_counts())
    df["面积"] = df["面积"].fillna('99999')
    NullKey = df[(df.面积 == '99999')].index.tolist()
    print(NullKey)
    df = df.drop(NullKey)
    print("*" * 30)
    print(df['面积'].isnull().value_counts())

    print("*" * 30)
    # 总价 column: same sentinel-and-drop treatment.
    print(df['总价'].isnull().value_counts())
    df["总价"] = df["总价"].fillna('99999')
    NullKey1 = df[(df.总价 == '99999')].index.tolist()
    print(NullKey1)
    df = df.drop(NullKey1)
    print("*" * 30)
    print(df['总价'].isnull().value_counts())
    # fixed: encoding was misspelled 'uf-8', an unknown codec name
    df.to_excel('111.xls', index=False, encoding='utf-8')

    print("修改后数据保存成功")
②数据预处理----将数据转换成易处理格式
def LeiChuli():
    """Normalise the 总价 column: strip '/套' and expand a trailing '万'.

    NOTE(review): the tail of this function was truncated in the
    published listing; the final print/save below is a reconstruction.
    """
    Data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls")
    ws = xlutils.copy.copy(Data)
    Table = Data.sheet_by_name("Sheet1")
    Nrows = Table.nrows
    list_A = []
    for i in range(1, Nrows):
        A = Table.cell_value(i, 6)
        list_A.append(re.sub('/套', '', A))  # drop the per-unit suffix
    Replace = []
    for Price_Str in list_A:
        # fixed: Price_Str[-1] raised IndexError on an empty cell
        if Price_Str and Price_Str[-1] == '万':
            # NOTE(review): plain text substitution turns "1.5万" into
            # "1.50000" - correct only for integer prices; confirm data.
            Replace.append(re.sub('万', '0000', Price_Str, 1))
        else:
            Replace.append(Price_Str)
    table = ws.get_sheet(0)
    print("------>开始写入修改后数据")
    for i in range(len(Replace)):
        table.write(i + 1, 6, Replace[i])
        print("第{}项写入成功".format(i))
    ws.save(r"F:实例Python实例爬虫111.xls")
    print("数据写入完成")
③对处理后的数据进行分析并绘制饼状图
def Data_Analysis_One():
    """Bucket listings by average price and draw a pie chart.

    Reads column 5 (均价) of the workbook, counts listings per price
    band and plots the distribution.  matplotlib normalises the raw
    counts itself, so the original manual percentage computation with
    int() truncation (which distorted the slice sizes, and divided by
    a row count that included the header) is unnecessary.
    """
    Data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls")
    Table = Data.sheet_by_name("Sheet1")
    Nrows = Table.nrows
    a, b, c, d, e, f = 0, 0, 0, 0, 0, 0

    for i in range(1, Nrows):  # skip the header row
        A = Table.cell_value(i, 5)
        if A == "价格待定":
            f += 1
        elif int(A) <= 5000:
            a += 1
        elif int(A) <= 10000:
            b += 1
        elif int(A) <= 15000:
            c += 1
        elif int(A) <= 20000:
            d += 1
        else:
            e += 1

    # Raw counts; plt.pie scales them to percentages internally.
    sizes = [a, b, c, d, e, f]
    labels = ('0-5000', '5001-10000', '10001-15000', '15001-20000',
              '20000-', 'Undetermined')
    explode = (0, 0, 0.1, 0, 0, 0)  # pull out the 10001-15000 slice
    plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90)
    plt.axis('equal')  # equal aspect ratio so the pie is a circle
    plt.show()
最后附上效果图。
最后附上完整代码:
1、爬虫代码
import requests
import bs4
from bs4 import BeautifulSoup
import lxml
import re
import xlrd
import xlwt
import xlutils.copy
import time


def GetHTML(Url):
    """
    1. Build every page URL from the fixed prefix.
    2. Download each page and narrow the html to the listing blocks.
    3. Hand the blocks to the per-field extractors and write the results.

    @param Url: invariant part of the target url (".../loupan/pg")
    @return: None - results are written straight to the workbook
    """
    # Build all page urls: the site pages via a trailing "pg<N>".
    WebDiZhi = []
    for i in range(1, 85):
        WebDiZhi.append(Url + str(i))

    print("共计{}页".format(len(WebDiZhi)))

    Xu = 0
    Shuliang = len(WebDiZhi)
    while Xu in range(Shuliang):  # walk the whole url list

        Web = requests.get(WebDiZhi[Xu])
        WebText = Web.text

        # Step 1: coarse filter - keep only the estate-list wrapper.
        soup_One = BeautifulSoup(WebText, 'html.parser')
        XinXi_One = soup_One.find_all(class_="resblock-list-wrapper")

        # Step 2: fine filter - keep the per-estate description blocks.
        soup_Two = BeautifulSoup(str(XinXi_One), 'lxml')
        XinXi_Two = soup_Two.find_all(class_="resblock-desc-wrapper")

        print("-----------------第{}页爬取成功------------".format(Xu))

        print("-----------------开始写入第{}页-------------".format(Xu))
        Name = GetName(XinXi_Two)  # estate name
        Write_File(Name, 0, Xu)
        print("---------小区名称写入成功---------")
        time.sleep(3)  # throttle between writes so the site is not hammered
        Nature = NatureHouse(XinXi_Two)  # residential / commercial
        Write_File(Nature, 1, Xu)
        print("---------小区性质写入成功---------")
        time.sleep(3)
        Status = StatusHouse(XinXi_Two)  # sale status (e.g. 在售)
        Write_File(Status, 2, Xu)
        print("---------小区状态写入成功---------")
        time.sleep(3)
        Address = AddressHouse(XinXi_Two)  # address
        Write_File(Address, 3, Xu)
        print("---------小区地址写入成功---------")
        time.sleep(3)
        Area = AreaHouse(XinXi_Two)  # floor area
        Write_File(Area, 4, Xu)
        print("---------小区面积写入成功---------")
        time.sleep(3)
        Average = AveragePriceHouse(XinXi_Two)  # average price
        Write_File(Average, 5, Xu)
        print("---------小区均价写入成功---------")
        time.sleep(3)
        Total = TotalPriceHouse(XinXi_Two)  # total price
        Write_File(Total, 6, Xu)
        print("---------小区总价写入成功---------")
        time.sleep(3)

        Xu += 1


def Write_File(Data, lei, Hang):
    """Write one field's values into workbook column *lei*.

    @param Data: list of cell values for a single field
    @param lei:  zero-based column index to write into
    @param Hang: page number; every page occupies a band of 10 rows
    """
    # NOTE(review): the path looks like it lost its backslashes in
    # transcription (probably F:\实例\...\111.xls) - confirm before running.
    data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls")
    ws = xlutils.copy.copy(data)
    table = ws.get_sheet(0)
    Shu = Hang * 10  # row offset: assumes exactly 10 listings per page
    for i in range(len(Data)):
        table.write(i + 1 + Shu, lei, Data[i])
        print("----第{}项写入成功----".format(i))
    ws.save(r"F:实例Python实例爬虫111.xls")


def GetName(XinXi):
    """
    @param XinXi: html fragments filtered out in step two of GetHTML
    @return: estate names, list type
    """
    Name_list = []  # fixed original 'Nmae_list' typo (local name only)
    Obtain_Name = BeautifulSoup(str(XinXi), 'lxml')
    for i in Obtain_Name.find_all(class_="name"):
        Name_list.append(BeautifulSoup(str(i), 'lxml').string)
    return Name_list


# The remaining extractors each parse one field out of the same
# step-two html fragments; some fields of the page are not covered.

def NatureHouse(Nature):
    """Return the property type (residential / commercial) per estate."""
    Nature_list = []
    Obtain_Nature = BeautifulSoup(str(Nature), 'lxml')
    for i in Obtain_Nature.find_all(class_='resblock-type'):
        Nature_list.append(BeautifulSoup(str(i), 'lxml').string)
    return Nature_list


def StatusHouse(Status):
    """Return the sale status (e.g. 在售) per estate."""
    Status_list = []
    Obtain_Status = BeautifulSoup(str(Status), 'lxml')
    for i in Obtain_Status.find_all(class_='sale-status'):
        Status_list.append(BeautifulSoup(str(i), 'lxml').string)
    return Status_list


def AddressHouse(Area):
    """
    @param Area: html fragments filtered out in step two of GetHTML
    @return: one combined address string per estate

    Each estate carries two <span> fragments (district + area) plus an
    <a> fragment (street address); the three parts are concatenated.
    """
    Target_Information_list = []
    Analysis_Label_One = BeautifulSoup(str(Area), 'lxml')
    Get_label_One = Analysis_Label_One.find_all(class_='resblock-location')
    Analysis_Label_Two = BeautifulSoup(str(Get_label_One), 'lxml')
    Get_label_Two = Analysis_Label_Two.find_all(name='span')

    # Strip tags, keeping only the text.  (fixed: r'<.>' only matched
    # single-character tag names, so tags such as <span> were never removed)
    Target_Information_One = []
    for i in Get_label_Two:
        Target_Information_One.append(re.sub(r'<[^>]*>', '', str(i)))

    # Merge the spans pairwise - district + area belong to one estate.
    # (fixed: the original nested while indexed one past the end and
    # raised IndexError whenever the span count was odd)
    Target_Information_Two = []
    for a in range(0, len(Target_Information_One) - 1, 2):
        Target_Information_Two.append(
            Target_Information_One[a + 1] + Target_Information_One[a])

    # Detailed street address from the <a> tags.
    Target_Information_Three = []
    for c in Analysis_Label_Two.find_all(name='a'):
        Target_Information_Three.append(re.sub(r'<[^>]*>', '', str(c)))

    # Pair each merged district string with its street address.
    A = min(len(Target_Information_Two), len(Target_Information_Three))
    for i in range(A):
        Target_Information_list.append(
            Target_Information_Two[i] + Target_Information_Three[i])

    return Target_Information_list


def AreaHouse(Area):
    """
    @param Area: html fragments filtered out in step two of GetHTML
    @return: room layout plus total floor area per estate
    """
    Area_list = []
    soup = BeautifulSoup(str(Area), 'lxml')

    # Room layout: strip tags, then collapse whitespace.  (fixed:
    # r'<.>' never matched real tags, and r's+' deleted the letter
    # 's' instead of whitespace - the backslash was lost)
    Get_Area_Two = []
    for c in soup.find_all(class_='resblock-room'):
        Get_Area_Two.append(
            re.sub(r'\s+', '', re.sub(r'<[^>]*>', '', str(c))))

    # Total floor area, cleaned the same way.
    Get_Area_Four = []
    for a in soup.find_all(class_='resblock-area'):
        Get_Area_Four.append(
            re.sub(r'\s+', '', re.sub(r'<[^>]*>', '', str(a))))

    # Join layout and area pairwise into the result list.
    A = min(len(Get_Area_Two), len(Get_Area_Four))
    for i in range(A):
        Area_list.append(Get_Area_Two[i] + Get_Area_Four[i])

    return Area_list


def AveragePriceHouse(Average):
    """Return the average price (the 'number' span) per estate."""
    Average_list = []
    Obtain_Average = BeautifulSoup(str(Average), 'lxml')
    for i in Obtain_Average.find_all(class_='number'):
        Average_list.append(BeautifulSoup(str(i), 'lxml').string)
    return Average_list


def TotalPriceHouse(Total):
    """Return the total price per estate, with the leading '总价' removed."""
    Total_list = []
    Obtain_Total = BeautifulSoup(str(Total), 'lxml')
    # fixed: the original called the non-existent method 'fjind_all',
    # which raises AttributeError on the first use
    for i in Obtain_Total.find_all(class_='second'):
        Get_Span_one = BeautifulSoup(str(i), 'lxml').string
        if Get_Span_one is not None:  # .string is None for nested tags
            Get_Span_one = Get_Span_one.lstrip('总价')
        Total_list.append(Get_Span_one)
    return Total_list


def Create_File():
    """Create the output workbook with a header row.

    The sheet must be named 'Sheet1': the downstream analysis code
    opens it with sheet_by_name("Sheet1"), so the original 'shett1'
    typo made that lookup fail.
    """
    name = ['名称', '性质', '状态', '地址', '面积', '均价', '总价']
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('Sheet1', cell_overwrite_ok=True)
    for i in range(len(name)):
        sheet.write(0, i, name[i])
    workbook.save(r'F:实例Python实例爬虫111.xls')
    print("文件创建成功")


if __name__ == '__main__':
    url = "https://qd.fang.lianjia.com/loupan/pg"
    Create_File()
    DataHtml = GetHTML(url)

    print("全部房产信息写入成功")
2、数据处理代码
import pandas as pd
import xlrd
import re
import xlutils.copy
import matplotlib.pyplot as plt


def ExceptNull():
    """
    Data cleaning step one: drop rows whose 面积 or 总价 cell is empty.

    Reads the scraped workbook, fills blanks with the sentinel '99999',
    drops those rows and writes the cleaned table to 111.xls.
    """
    df = pd.DataFrame(pd.read_excel(r'F:实例Python实例爬虫111.xls'))
    # 面积 column: fill blanks with a sentinel, then drop those rows.
    print(df['面积'].isnull().value_counts())
    df["面积"] = df["面积"].fillna('99999')
    NullKey = df[(df.面积 == '99999')].index.tolist()
    print(NullKey)
    df = df.drop(NullKey)
    print("*" * 30)
    print(df['面积'].isnull().value_counts())

    print("*" * 30)
    # 总价 column: same sentinel-and-drop treatment.
    print(df['总价'].isnull().value_counts())
    df["总价"] = df["总价"].fillna('99999')
    NullKey1 = df[(df.总价 == '99999')].index.tolist()
    print(NullKey1)
    df = df.drop(NullKey1)
    print("*" * 30)
    print(df['总价'].isnull().value_counts())
    # fixed: encoding was misspelled 'uf-8', an unknown codec name
    df.to_excel('111.xls', index=False, encoding='utf-8')

    print("修改后数据保存成功")


def LeiChuli():
    """Normalise the 总价 column: strip '/套' and expand a trailing '万'.

    NOTE(review): the tail of this function was truncated in the
    published listing; the final print/save below is a reconstruction.
    """
    Data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls")
    ws = xlutils.copy.copy(Data)
    Table = Data.sheet_by_name("Sheet1")
    Nrows = Table.nrows
    list_A = []
    for i in range(1, Nrows):
        A = Table.cell_value(i, 6)
        list_A.append(re.sub('/套', '', A))  # drop the per-unit suffix
    Replace = []
    for Price_Str in list_A:
        # fixed: Price_Str[-1] raised IndexError on an empty cell
        if Price_Str and Price_Str[-1] == '万':
            # NOTE(review): plain text substitution turns "1.5万" into
            # "1.50000" - correct only for integer prices; confirm data.
            Replace.append(re.sub('万', '0000', Price_Str, 1))
        else:
            Replace.append(Price_Str)
    table = ws.get_sheet(0)
    print("------>开始写入修改后数据")
    for i in range(len(Replace)):
        table.write(i + 1, 6, Replace[i])
        print("第{}项写入成功".format(i))
    ws.save(r"F:实例Python实例爬虫111.xls")
    print("数据写入完成")
数据来源于链家青岛站部分数据,因为一些原因爬取结果可能不是完全符合预期。
私信小编01 我教你爬虫和数据分析呀 源码获取哦!