python3实用编程技巧进阶吾爱破解_新手初学python3编程一些简易代码

论坛 期权论坛 编程之家     
选择匿名的用户   2021-6-2 13:09   312   0

[Python] 纯文本查看 复制代码# -*- coding: utf-8 -*-

# coder Thending

# python3.6.4

#以下是需要用到的库

import requests

import json

import re

import pandas

import time

from bs4 import BeautifulSoup

from datetime import datetime

# Keep prompting until the user enters a whole number of pages to fetch.
# (Indentation was lost in the original paste; reconstructed here.)
while 1:
    y = input('请问要获取几页新浪国内新闻(一页22个):')
    if y.isdigit():
        print("下载中···")
        break
    else:
        print("输入错误,请重新输入")

# Number of list pages to download (22 articles per page).
x = int(y)

# Template for the comment-count JSON endpoint; {} is filled with the news id.
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1'

# Template for the paginated news-list JSONP endpoint; {} is filled with the
# 1-based page number. Returns 22 articles per page.
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1521876473467'

def getCommentsCounts(newsurl):
    """Return the total comment count for one Sina news article.

    Extracts the news id from *newsurl* (pattern ``doc-i<ID>.shtml``),
    queries the comment endpoint, and reads ``result.count.total`` from
    the returned JSON. Returns 0 if the URL does not match the pattern.
    """
    m = re.search('doc-i(.*).shtml', newsurl)
    if m is None:
        # Not a recognizable article URL; report zero rather than crash
        # on m.group(1) with AttributeError.
        return 0
    newsid = m.group(1)
    comments = requests.get(commentURL.format(newsid))
    # The body is JavaScript: "var data={...}". str.strip('var data=')
    # treats its argument as a CHARACTER SET and can also eat matching
    # trailing characters, so remove the exact prefix instead.
    text = comments.text
    prefix = 'var data='
    if text.startswith(prefix):
        text = text[len(prefix):]
    jd = json.loads(text)
    return jd['result']['count']['total']

def getNewsDetail(newsurl):
    """Scrape one Sina news article page into a dict.

    Keys (kept in Chinese to preserve the CSV column names): 时间 (time),
    来源 (source), 内容 (body text), 标题 (title), 作者 (author),
    评论数 (comment count), 链接 (url).
    """
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # NOTE: original code named this local `time`, shadowing the imported
    # `time` module inside this function; renamed to avoid confusion.
    date_text = soup.select('.date')[0].text
    times = datetime.strptime(date_text, '%Y年%m月%d日 %H:%M')
    result['时间'] = times.strftime("%Y-%m-%d %H:%M")
    result['来源'] = soup.select('.source')[0].text
    # The last two <p> tags are the editor credit / boilerplate, so drop
    # them — presumably stable page structure; verify against the site.
    result['内容'] = ' '.join([p.text.strip() for p in soup.select('.article p')[:-2]])
    result['标题'] = soup.select('.main-title')[0].text
    # str.strip('责任编辑:') strips a CHARACTER SET from both ends, which can
    # mangle author names containing any of those characters; remove the
    # exact "责任编辑:" prefix instead.
    author = soup.select('.show_author')[0].text
    prefix = '责任编辑:'
    if author.startswith(prefix):
        author = author[len(prefix):]
    result['作者'] = author
    result['评论数'] = getCommentsCounts(newsurl)
    result['链接'] = newsurl
    return result

def parseListLinks(url):
    """Fetch one news-list page and scrape every article linked on it.

    The endpoint returns a JSONP payload of the form
    ``newsloadercallback({...});``. Returns a list of detail dicts
    produced by :func:`getNewsDetail`.
    """
    res = requests.get(url)
    # lstrip(' newsloadercallback(') / rstrip(');') treat their arguments
    # as CHARACTER SETS and could eat characters belonging to the JSON
    # itself; slice between the outermost parentheses instead.
    text = res.text
    start = text.find('(')
    end = text.rfind(')')
    jd = json.loads(text[start + 1:end])
    return [getNewsDetail(ent['url']) for ent in jd['result']['data']]

# Download every requested list page, scrape the articles, and dump the
# accumulated records to a CSV in the current directory.
news_total = []
for i in range(1, 1 + x):
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)

df = pandas.DataFrame(news_total)
# utf_8_sig writes a BOM so Excel opens the Chinese text correctly.
df.to_csv("sina_news.csv", encoding="utf_8_sig")

if len(news_total) > 0:
    print("已在当前目录下生成CSV文件!")

# Brief pause before the console window closes.
# NOTE(review): the original paste's indentation is ambiguous — the sleep
# may have been intended inside the `if`; placed at top level here.
time.sleep(3)

分享到 :
0 人收藏
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

积分:3875789
帖子:775174
精华:0
期权论坛 期权论坛
发布
内容

下载期权论坛手机APP