python延时函数_Python基础——爬虫以及简单的数据分析!入门非常简单

论坛 期权论坛 编程之家     
选择匿名的用户   2021-5-26 10:59   49   0

目标:使用Python编写爬虫,获取链家青岛站的房产信息,然后对爬取的房产信息进行分析。

环境:win10+python3.8+pycharm

Python库:

1 import requests2 import bs43 from bs4 import BeautifulSoup4 import lxml5 import re6 import xlrd7 import xlwt8 import xlutils.copy9 import time

目标分析:

1、编写爬虫爬取链家青岛站的房产信息

①分析目标链接

第一页:https://qd.fang.lianjia.com/loupan/pg1

第二页:https://qd.fang.lianjia.com/loupan/pg2

由上面的链接可以看出来,不同网页是使用最后的pgx来进行变化的

所以将链接分为两部分,使用字符串拼接获得所有的房产网页链接

# Build the URL of every result page: the fixed prefix plus page numbers 1..84.
WebDiZhi = [Url + str(i) for i in range(1, 85)]

使用遍历获得所有的链接并保存为列表

②分析网页结构

 1 #获取目标网页的html代码并进行解析 2     Xu = 0 3     Shuliang = len(WebDiZhi) 4     while Xu in range(Shuliang):#循环整个列表 5  6         Web = requests.get(WebDiZhi[Xu]) 7         WebText = Web.text 8  9         #第一步、粗筛选目标信息所在的html代码,去除大部分无效信息代码10         soup_One = BeautifulSoup(WebText,'html.parser')11         XinXi_One = soup_One.find_all(class_="resblock-list-wrapper")12 13         #第二步、进一步筛选目标信息所在html代码,去除无效信息代码14         soup_Two = BeautifulSoup(str(XinXi_One),'lxml')15         XinXi_Two = soup_Two.find_all(class_="resblock-desc-wrapper")

通过两步简单的筛选将房产信息所对应的html代码筛选出来

方便进一步分析html网页标签获取不同的房产信息

③针对不同的房产信息定义不同的函数,通过调用函数来获取不同的房产信息并保存到目标文件中

 1  print("-----------------开始写入第{}页-------------".format(Xu)) 2         Name = GetName(XinXi_Two)  # 获取小区名称 3         Write_File(Name, 0,Xu) 4         print("---------小区名称写入成功---------") 5         time.sleep(3)  #延时 6         Nature = NatureHouse(XinXi_Two)  # 获取小区住宅性质(住宅、商业性) 7         Write_File(Nature, 1,Xu) 8         print("---------小区性质写入成功---------") 9         time.sleep(3)10         Status = StatusHouse(XinXi_Two)  # 获取小区状态(在售)11         Write_File(Status, 2,Xu)12         print("---------小区状态写入成功---------")13         time.sleep(3)14         Address = AddressHouse(XinXi_Two)  # 获取小区地址15         Write_File(Address, 3,Xu)16         print("---------小区地址写入成功---------")17         time.sleep(3)18         Area = AreaHouse(XinXi_Two)  # 获取小区房屋面积19         Write_File(Area, 4,Xu)20         print("---------小区面积写入成功---------")21         time.sleep(3)22         Average = AveragePriceHouse(XinXi_Two)  # 均价23         Write_File(Average, 5,Xu)24         print("---------小区均价写入成功---------")25         time.sleep(3)26         Total = TotalPriceHouse(XinXi_Two)  # 总价27         Write_File(Total, 6,Xu)28         print("---------小区总价写入成功---------")29         time.sleep(3)

各房产信息函数

  1 def Write_File(Data, lei,Hang):  2     data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls")  3     ws = xlutils.copy.copy(data)  4     table = ws.get_sheet(0)  5     Shu = Hang * 10  6     for i in range(len(Data)):  7         table.write(i + 1 + Shu, lei, Data[i])  8         print("----第{}项写入成功----".format(i))  9         ws.save(r"F:实例Python实例爬虫111.xls") 10  11  12 def GetName(XinXi): 13     """ 14     @param XinXi: 传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息 15     @return: 返回小区名称,列表类型 16     """ 17     Nmae_list = [] 18     # 获取小区名称 19     Obtain_Name_One = BeautifulSoup(str(XinXi), 'lxml') 20     Name_One = Obtain_Name_One.findAll(class_="name") 21     for i in Name_One: 22         Get_A = BeautifulSoup(str(i), 'lxml') 23         Nmae_list.append(Get_A.string) 24     return Nmae_list 25  26 """ 27 代码以及目标信息均已获取,通过不同函数将html代码在对应函数中逐一进行解析获取函数对应信息并保存即可 28 以下为部分函数,其他函数未定义 29  30 """ 31 def NatureHouse(Nature): 32     """房屋性质""" 33     Nature_list = [] 34     Obtain_Nature = BeautifulSoup(str(Nature), 'lxml') 35     Nature_one = Obtain_Nature.find_all(class_='resblock-type') 36     for i in Nature_one: 37         Get_Span = BeautifulSoup(str(i), 'lxml') 38         Nature_list.append(Get_Span.string) 39     return Nature_list 40  41 def StatusHouse(Status): 42     """房屋状态""" 43     Status_list = [] 44     Obtain_Nature = BeautifulSoup(str(Status), 'lxml') 45     Status_one = Obtain_Nature.find_all(class_='sale-status') 46     for i in Status_one: 47         Get_Span = BeautifulSoup(str(i), 'lxml') 48         Status_list.append(Get_Span.string) 49     return Status_list 50  51 def AddressHouse(Area): 52     """ 53  54  55     @param Area:传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息 56     @return: 57     Analysis_Label_xxx:分析标签,xxx:代表第几次分析 58     Target_Information_xxx:目标信息,xxx:代表第几个信息部分,总共分为两部分,以及一个整体信息存储列表Target_Information_list 59     """ 60     #获取标签 61     Target_Information_list = [] 62     Analysis_Label_One = BeautifulSoup(str(Area), 'lxml') 63     # 
获取div标签,calss=resblock-location 64     Get_label_One = Analysis_Label_One.find_all(class_='resblock-location') 65     #解析标签并获得span标签 66     Analysis_Label_Two = BeautifulSoup(str(Get_label_One), 'lxml') 67     Get_label_Two = Analysis_Label_Two.find_all(name='span') 68  69  70     #获取span标签里面的文字内容并保存在列表内 71  72     #第一个 73     Target_Information_One = [] 74     for i in Get_label_Two: 75         #使用正则表达式取出内部信息并保存在列表中 76         Information_Str = re.sub(r'<.>','',str(i)) 77         Target_Information_One.append(Information_Str) 78     #将列表内相同小区的地址进行合并,使用循环嵌套获取内容、合并最后保存在列表内 79     i = 1 80     a = 0 81  82     #第二个,第二个信息是在第一个信息的基础上合并列表内的元素得来 83     Target_Information_Two = [] 84     while i <= len(Target_Information_One): 85         while a < i: 86             #将Target_Information_One中每两项进行合并 87             Information_Two = Target_Information_One[a] 88             Information_One = Target_Information_One[i] 89             Information_Three = Information_One + Information_Two 90  91             Target_Information_Two.append(Information_Three) 92             a += 2 93         i += 2 94  95  96     #获取详细地址 97  98     #第三个 99     Target_Information_Three = []100     Span_html_One = Analysis_Label_Two.find_all(name='a')101     for c in Span_html_One:102         Area_Str_1 = re.sub(r'<.>', '', str(c))103         Target_Information_Three.append(Area_Str_1)104 105 106     # 将Target_Information_Two和Target_Information_Three两个列表中的各项元素分别进行合并并保存在Area_list列表中107     A = min(len(Target_Information_Two),len(Target_Information_Three))108     for i in range(A):109         Target_Information_list.append(Target_Information_Two[i] + Target_Information_Three[i])110 111 112     return Target_Information_list113 114 115 def AreaHouse(Area):116     """117 118     @param Area: 传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息119     @return: 返回房屋房间数量以及房屋总面积120     """121     Area_list = []122     #筛选目标信息的父标签123     Obtain_Area_One = BeautifulSoup(str(Area), 'lxml')124     Area_one = 
Obtain_Area_One.find_all(class_='resblock-room')125 126     #通过正则表达式去除多余的html标签信息127     Get_Area_One = []128     for c in Area_one:129         Area_Str_1 = re.sub(r'<.>', '', str(c))130         Get_Area_One.append(Area_Str_1)131 132     #通过正则表达式去除多余的换行符133     Get_Area_Two = []134     for i in Get_Area_One:135         Area_Str_2 = re.sub(r's+','',str(i))136         Get_Area_Two.append(Area_Str_2)137 138 139     #开始获取房屋总面积140     Obtain_Area_Two = BeautifulSoup(str(Area),'lxml')141     Area_two = Obtain_Area_Two.find_all(class_='resblock-area')142     #通过正则表达式去除多余的html标签信息143     Get_Area_Three = []144     for a in Area_two:145         Area_Str_3 = re.sub(r'<.>', '', str(a))146         Get_Area_Three.append(Area_Str_3)147 148     # 通过正则表达式去除多余的换行符149     Get_Area_Four = []150     for r in Get_Area_Three:151         Area_Str_4 = re.sub(r's+', '', str(r))152         Get_Area_Four.append(Area_Str_4)153 154     # 将Get_Area_Two和Get_Area_Four两个列表中的各项元素分别进行合并并保存在Area_list列表中155     A = min(len(Get_Area_Two), len(Get_Area_Four))156     for i in range(A):157         Area_list.append(Get_Area_Two[i] + Get_Area_Four[i])158 159     return Area_list160 161 def AveragePriceHouse(Average):162     """163     房屋均价164     @param Average:165     @return:166     """167     Average_list = []168     Obtain_Average = BeautifulSoup(str(Average), 'lxml')169     Average_one = Obtain_Average.find_all(class_='number')170     for i in Average_one:171         Get_Span = BeautifulSoup(str(i), 'lxml')172         Average_list.append(Get_Span.string)173 174     return Average_list175 176 177 178 def TotalPriceHouse(Total):179     """180     房屋总价181 182     @param Total:183     @return:184     """185     Total_list = []186     Obtain_Total = BeautifulSoup(str(Total), 'lxml')187     Total_one = Obtain_Total.fjind_all(class_='second')188     for i in Total_one:189         Get_Span = BeautifulSoup(str(i), 'lxml')190         Get_Span_one = Get_Span.string191         Get_Span_two = 
Get_Span_one.lstrip('总价')192         Total_list.append(Get_Span_two)193 194 195     return Total_list

创建存储文件

def Create_File():
    """Create the result workbook and write its header row.

    The sheet is named "Sheet1" because the later processing steps open the
    sheet by that name.

    @return: None
    """
    headers = ['名称', '性质', '状态', '地址', '面积', '均价', '总价']
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('Sheet1', cell_overwrite_ok=True)
    for column, title in enumerate(headers):
        sheet.write(0, column, title)
    # NOTE(review): path reproduced as published; separators may have been
    # lost -- confirm before use.
    workbook.save(r'F:实例Python实例爬虫111.xls')
    print("文件创建成功")

2、简单的数据分析并使用饼状图表示房产均价比例

所用到的库:

import pandas as pdimport xlrdimport reimport xlutils.copyimport matplotlib.pyplot as plt

①数据清洗----删除空值行

 1 def ExceptNull(): 2     """ 3     数据清洗第一步:去除表中空值 4     @param df: 传入读取的xls表格数据 5     @return: 保存数据后返回, 6     """ 7     df = pd.DataFrame(pd.read_excel(r'F:实例Python实例爬虫111.xls')) 8     #查找面积列空值,使用99999填充空缺值后删除所在行 9     print(df['面积'].isnull().value_counts())10     df["面积"] = df["面积"].fillna('99999')11     NullKey = df[(df.面积 == '99999')].index.tolist()12     print(NullKey)13     df = df.drop(NullKey)14     print("*"*30)15     print(df['面积'].isnull().value_counts())16 17     print("*"*30)18     #查找总价列空值,使用99999填充空缺值后删除所在行19     print(df['总价'].isnull().value_counts())20     df["总价"] = df["总价"].fillna('99999')21     NullKey1 = df[(df.总价 == '99999')].index.tolist()22     print(NullKey1)23     df = df.drop(NullKey1)24     print("*"*30)25     print(df['总价'].isnull().value_counts())26     df.to_excel('111.xls',index=False,encoding='uf-8')27 28 29     print("修改后数据保存成功")

②数据预处理----将数据转换成易处理格式

 1 def LeiChuli(): 2     Data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls") 3     ws = xlutils.copy.copy(Data) 4     Table = Data.sheet_by_name("Sheet1") 5     Nrows = Table.nrows 6     list_A = [] 7     for i in range(1,Nrows): 8         A = Table.cell_value(i,6) 9         A_Str = re.sub('/套','',A,Nrows)10         list_A.append(A_Str)11     Replace = []12     for i in range(len(list_A)):13         Price_Str = list_A[i]14         Last_Str = Price_Str[-1]15         if Last_Str == '万':16             A_Str = re.sub('万', '0000', Price_Str, 1)17             Replace.append(A_Str)18         else:19             Replace.append(Price_Str)20     table = ws.get_sheet(0)21     for i in range(len(Replace)):22         table.write(i + 1, 6, Replace[i])23         print("------>开始写入修改后数据第{}项写入成功数据写入完成

③对处理后的数据进行分析并绘制饼状图

 1 def Data_Analysis_One(): 2     Data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls") 3     ws = xlutils.copy.copy(Data) 4     Table = Data.sheet_by_name("Sheet1") 5     Nrows = Table.nrows 6     a,b,c,d,e,f = 0,0,0,0,0,0 7  8     for i in range(1, Nrows): 9         A = Table.cell_value(i, 5)10         if A == "价格待定":11             f += 112         else:13             if int(A) <= 5000:14                 a += 115             elif int(A) <= 10000:16                 b += 117             elif int(A) <= 15000:18                 c += 119             elif int(A) <= 20000:20                 d += 121             else:22                 e += 123 24     # 开始准备绘制饼状图25 26     #价格区间数据准备27     sizes = []28     Percentage_a = (a / Nrows) * 10029     sizes.append(int(Percentage_a))30     Percentage_b = (b / Nrows) * 10031     sizes.append(int(Percentage_b))32     Percentage_c = (c / Nrows) * 10033     sizes.append(int(Percentage_c))34     Percentage_d = (d / Nrows) * 10035     sizes.append(int(Percentage_d))36     Percentage_e = (e / Nrows) * 10037     sizes.append(int(Percentage_e))38     Percentage_f = (f / Nrows) * 10039     sizes.append(int(Percentage_f))40     #设置占比说明41     labels = '0-5000','5001-10000','10001-15000','15001-20000','20000-','Undetermined'42     explode = (0,0,0.1,0,0,0)43     #开始绘制44     plt.pie(sizes,explode=explode,labels=labels,autopct='%1.1f%%',shadow=True,startangle=90)45     plt.axis('equal')46     plt.show()47     ws.save(r"F:实例Python实例爬虫111.xls")

最后附上效果图。

9d3e8ab1def18107c5d8bccaf6a67930.png

最后附上完整代码:

1、爬虫代码

  1 import requests  2 import bs4  3 from bs4 import BeautifulSoup  4 import lxml  5 import re  6 # import LianJia_QD_DataProcessing  7 import xlrd  8 import xlwt  9 import xlutils.copy 10 import time 11  12 def GetHTML(Url): 13     """ 14     1、通过传入url组合,获取所有网页地址的url 15     2、获取目标网页的html代码并进行解析 16     3、解析后将目标信息分别写入字典类型的变量并返回 17  18     @param Url: 目标网址的不变链接 19     @return: 网站目标信息 20  21     """ 22  23     #通过传入url组合,获取所有网页地址的url 24     WebDiZhi = [] 25     for i in range(1,85): 26         UrlHTML = Url + str(i) 27         WebDiZhi.append(UrlHTML) 28  29     print("共计{}页".format(len(WebDiZhi))) 30     # Create_File() 31     #获取目标网页的html代码并进行解析 32     Xu = 0 33     Shuliang = len(WebDiZhi) 34     while Xu in range(Shuliang):#range(len(WebDiZhi))--循环整个列表 35  36         Web = requests.get(WebDiZhi[Xu]) 37         WebText = Web.text 38  39         #第一步、粗筛选目标信息所在的html代码,去除大部分无效信息代码 40         soup_One = BeautifulSoup(WebText,'html.parser') 41         XinXi_One = soup_One.find_all(class_="resblock-list-wrapper") 42  43         #第二步、进一步筛选目标信息所在html代码,去除无效信息代码 44         soup_Two = BeautifulSoup(str(XinXi_One),'lxml') 45         XinXi_Two = soup_Two.find_all(class_="resblock-desc-wrapper") 46  47         print("-----------------第{}页爬取成功------------".format(Xu)) 48     #     Html.append(XinXi_Two) 49     #     time.sleep(1) 50     # return Html 51  52         print("-----------------开始写入第{}页-------------".format(Xu)) 53         Name = GetName(XinXi_Two)  # 获取小区名称 54         Write_File(Name, 0,Xu) 55         print("---------小区名称写入成功---------") 56         time.sleep(3) 57         Nature = NatureHouse(XinXi_Two)  # 获取小区住宅性质(住宅、商业性) 58         Write_File(Nature, 1,Xu) 59         print("---------小区性质写入成功---------") 60         time.sleep(3) 61         Status = StatusHouse(XinXi_Two)  # 获取小区状态(在售) 62         Write_File(Status, 2,Xu) 63         print("---------小区状态写入成功---------") 64         time.sleep(3) 65         Address = AddressHouse(XinXi_Two)  # 获取小区地址 66         
Write_File(Address, 3,Xu) 67         print("---------小区地址写入成功---------") 68         time.sleep(3) 69         Area = AreaHouse(XinXi_Two)  # 获取小区房屋面积 70         Write_File(Area, 4,Xu) 71         print("---------小区面积写入成功---------") 72         time.sleep(3) 73         Average = AveragePriceHouse(XinXi_Two)  # 均价 74         Write_File(Average, 5,Xu) 75         print("---------小区均价写入成功---------") 76         time.sleep(3) 77         Total = TotalPriceHouse(XinXi_Two)  # 总价 78         Write_File(Total, 6,Xu) 79         print("---------小区总价写入成功---------") 80         time.sleep(3) 81  82         Xu += 1 83  84         # 调用不同函数获取不同信息 85  86  87 def Write_File(Data, lei,Hang): 88     data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls") 89     ws = xlutils.copy.copy(data) 90     table = ws.get_sheet(0) 91     Shu = Hang * 10 92     for i in range(len(Data)): 93         table.write(i + 1 + Shu, lei, Data[i]) 94         print("----第{}项写入成功----".format(i)) 95         ws.save(r"F:实例Python实例爬虫111.xls") 96  97  98 def GetName(XinXi): 99     """100     @param XinXi: 传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息101     @return: 返回小区名称,列表类型102     """103     Nmae_list = []104     # 获取小区名称105     Obtain_Name_One = BeautifulSoup(str(XinXi), 'lxml')106     Name_One = Obtain_Name_One.findAll(class_="name")107     for i in Name_One:108         Get_A = BeautifulSoup(str(i), 'lxml')109         Nmae_list.append(Get_A.string)110     return Nmae_list111 112 """113 代码以及目标信息均已获取,通过不同函数将html代码在对应函数中逐一进行解析获取函数对应信息并保存即可114 以下为部分函数,其他函数未定义115 116 """117 def NatureHouse(Nature):118     """房屋性质"""119     Nature_list = []120     Obtain_Nature = BeautifulSoup(str(Nature), 'lxml')121     Nature_one = Obtain_Nature.find_all(class_='resblock-type')122     for i in Nature_one:123         Get_Span = BeautifulSoup(str(i), 'lxml')124         Nature_list.append(Get_Span.string)125     return Nature_list126 127 def StatusHouse(Status):128     """房屋状态"""129     Status_list = []130     Obtain_Nature = 
BeautifulSoup(str(Status), 'lxml')131     Status_one = Obtain_Nature.find_all(class_='sale-status')132     for i in Status_one:133         Get_Span = BeautifulSoup(str(i), 'lxml')134         Status_list.append(Get_Span.string)135     return Status_list136 137 def AddressHouse(Area):138     """139 140 141     @param Area:传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息142     @return:143     Analysis_Label_xxx:分析标签,xxx:代表第几次分析144     Target_Information_xxx:目标信息,xxx:代表第几个信息部分,总共分为两部分,以及一个整体信息存储列表Target_Information_list145     """146     #获取标签147     Target_Information_list = []148     Analysis_Label_One = BeautifulSoup(str(Area), 'lxml')149     # 获取div标签,calss=resblock-location150     Get_label_One = Analysis_Label_One.find_all(class_='resblock-location')151     #解析标签并获得span标签152     Analysis_Label_Two = BeautifulSoup(str(Get_label_One), 'lxml')153     Get_label_Two = Analysis_Label_Two.find_all(name='span')154 155 156     #获取span标签里面的文字内容并保存在列表内157 158     #第一个159     Target_Information_One = []160     for i in Get_label_Two:161         #使用正则表达式取出内部信息并保存在列表中162         Information_Str = re.sub(r'<.>','',str(i))163         Target_Information_One.append(Information_Str)164     #将列表内相同小区的地址进行合并,使用循环嵌套获取内容、合并最后保存在列表内165     i = 1166     a = 0167 168     #第二个,第二个信息是在第一个信息的基础上合并列表内的元素得来169     Target_Information_Two = []170     while i <= len(Target_Information_One):171         while a < i:172             #将Target_Information_One中每两项进行合并173             Information_Two = Target_Information_One[a]174             Information_One = Target_Information_One[i]175             Information_Three = Information_One + Information_Two176 177             Target_Information_Two.append(Information_Three)178             a += 2179         i += 2180 181 182     #获取详细地址183 184     #第三个185     Target_Information_Three = []186     Span_html_One = Analysis_Label_Two.find_all(name='a')187     for c in Span_html_One:188         Area_Str_1 = re.sub(r'<.>', '', str(c))189         
Target_Information_Three.append(Area_Str_1)190 191 192     # 将Target_Information_Two和Target_Information_Three两个列表中的各项元素分别进行合并并保存在Area_list列表中193     A = min(len(Target_Information_Two),len(Target_Information_Three))194     for i in range(A):195         Target_Information_list.append(Target_Information_Two[i] + Target_Information_Three[i])196 197 198     return Target_Information_list199 200 201 def AreaHouse(Area):202     """203 204     @param Area: 传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息205     @return: 返回房屋房间数量以及房屋总面积206     """207     Area_list = []208     #筛选目标信息的父标签209     Obtain_Area_One = BeautifulSoup(str(Area), 'lxml')210     Area_one = Obtain_Area_One.find_all(class_='resblock-room')211 212     #通过正则表达式去除多余的html标签信息213     Get_Area_One = []214     for c in Area_one:215         Area_Str_1 = re.sub(r'<.>', '', str(c))216         Get_Area_One.append(Area_Str_1)217 218     #通过正则表达式去除多余的换行符219     Get_Area_Two = []220     for i in Get_Area_One:221         Area_Str_2 = re.sub(r's+','',str(i))222         Get_Area_Two.append(Area_Str_2)223 224 225     #开始获取房屋总面积226     Obtain_Area_Two = BeautifulSoup(str(Area),'lxml')227     Area_two = Obtain_Area_Two.find_all(class_='resblock-area')228     #通过正则表达式去除多余的html标签信息229     Get_Area_Three = []230     for a in Area_two:231         Area_Str_3 = re.sub(r'<.>', '', str(a))232         Get_Area_Three.append(Area_Str_3)233 234     # 通过正则表达式去除多余的换行符235     Get_Area_Four = []236     for r in Get_Area_Three:237         Area_Str_4 = re.sub(r's+', '', str(r))238         Get_Area_Four.append(Area_Str_4)239 240     # 将Get_Area_Two和Get_Area_Four两个列表中的各项元素分别进行合并并保存在Area_list列表中241     A = min(len(Get_Area_Two), len(Get_Area_Four))242     for i in range(A):243         Area_list.append(Get_Area_Two[i] + Get_Area_Four[i])244 245     return Area_list246 247 def AveragePriceHouse(Average):248     """249     房屋均价250     @param Average:251     @return:252     """253     Average_list = []254     Obtain_Average = BeautifulSoup(str(Average), 
'lxml')255     Average_one = Obtain_Average.find_all(class_='number')256     for i in Average_one:257         Get_Span = BeautifulSoup(str(i), 'lxml')258         Average_list.append(Get_Span.string)259 260     return Average_list261 262 263 264 def TotalPriceHouse(Total):265     """266     房屋总价267 268     @param Total:269     @return:270     """271     Total_list = []272     Obtain_Total = BeautifulSoup(str(Total), 'lxml')273     Total_one = Obtain_Total.fjind_all(class_='second')274     for i in Total_one:275         Get_Span = BeautifulSoup(str(i), 'lxml')276         Get_Span_one = Get_Span.string277         Get_Span_two = Get_Span_one.lstrip('总价')278         Total_list.append(Get_Span_two)279 280 281     return Total_list282 283 284 def Create_File():285     name = ['名称','性质','状态','地址','面积','均价','总价',]286     workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)287     sheet = workbook.add_sheet('shett1', cell_overwrite_ok=True)288     for i in range(len(name)):289         sheet.write(0, i, name[i])290     workbook.save(r'F:实例Python实例爬虫111.xls')291     print("文件创建成功")292 293 294 if __name__ == '__main__':295     url = "https://qd.fang.lianjia.com/loupan/pg"296     Create_File()297     DataHtml = GetHTML(url)298 299     print("全部房产信息写入成功")

2、数据处理代码

  1 import pandas as pd  2 import xlrd  3 import re  4 import xlutils.copy  5 import matplotlib.pyplot as plt  6   7 def ExceptNull():  8     """  9     数据清洗第一步:去除表中空值 10     @param df: 传入读取的xls表格数据 11     @return: 保存数据后返回, 12     """ 13     df = pd.DataFrame(pd.read_excel(r'F:实例Python实例爬虫111.xls')) 14     #查找面积列空值,使用99999填充空缺值后删除所在行 15     print(df['面积'].isnull().value_counts()) 16     df["面积"] = df["面积"].fillna('99999') 17     NullKey = df[(df.面积 == '99999')].index.tolist() 18     print(NullKey) 19     df = df.drop(NullKey) 20     print("*"*30) 21     print(df['面积'].isnull().value_counts()) 22  23     print("*"*30) 24     #查找总价列空值,使用99999填充空缺值后删除所在行 25     print(df['总价'].isnull().value_counts()) 26     df["总价"] = df["总价"].fillna('99999') 27     NullKey1 = df[(df.总价 == '99999')].index.tolist() 28     print(NullKey1) 29     df = df.drop(NullKey1) 30     print("*"*30) 31     print(df['总价'].isnull().value_counts()) 32     df.to_excel('111.xls',index=False,encoding='uf-8') 33  34  35     print("修改后数据保存成功") 36  37  38 def LeiChuli(): 39     Data = xlrd.open_workbook(r"F:实例Python实例爬虫111.xls") 40     ws = xlutils.copy.copy(Data) 41     Table = Data.sheet_by_name("Sheet1") 42     Nrows = Table.nrows 43     list_A = [] 44     for i in range(1,Nrows): 45         A = Table.cell_value(i,6) 46         A_Str = re.sub('/套','',A,Nrows) 47         list_A.append(A_Str) 48     Replace = [] 49     for i in range(len(list_A)): 50         Price_Str = list_A[i] 51         Last_Str = Price_Str[-1] 52         if Last_Str == '万': 53             A_Str = re.sub('万', '0000', Price_Str, 1) 54             Replace.append(A_Str) 55         else: 56             Replace.append(Price_Str) 57     table = ws.get_sheet(0) 58     for i in range(len(Replace)): 59         table.write(i + 1, 6, Replace[i]) 60         print("------>开始写入修改后数据第{}项写入成功数据写入完成

数据来源于链家青岛站部分数据,因为一些原因爬取结果可能不是完全符合预期。

私信小编01 我教你爬虫和数据分析呀 源码获取哦!

分享到 :
0 人收藏
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

积分:3875789
帖子:775174
精华:0
期权论坛 期权论坛
发布
内容

下载期权论坛手机APP