python3 爬取 kaggle-dataset目录

论坛 期权论坛 脚本     
匿名技术用户   2020-12-27 05:41   962   0
import logging
import os

# Make sure the output directory exists before anything is written to it.
# Every reader/writer in this script uses "./data/", so that is the directory
# created here; exist_ok=True keeps repeated runs from raising
# FileExistsError.
os.makedirs("./data/", exist_ok=True)

def write_kaggle_datasets(i):
    """Download one page of the Kaggle dataset listing and save it to disk.

    Page ``i + 1`` of the ``datasets_v2.json`` endpoint (``pageSize=25``,
    ``sortBy=hottest``) is requested and the raw response body is stored
    unmodified as ``./data/dataset_split<i>``.

    Args:
        i: Zero-based split index; the remote page requested is ``i + 1``.

    Returns:
        ``True`` once the response body has been written.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the output file cannot be created or written.
    """
    import requests

    url = (
        'https://www.kaggle.com/datasets_v2.json?sortBy=hottest&group=public&page='
        + str(i + 1)
        + '&pageSize=25&size=all&filetype=all&license=all'
    )
    filename = 'dataset_split' + str(i)
    chunk_size = 1024

    # The target directory may not exist yet; creating it here keeps the
    # function usable on its own.
    os.makedirs("./data/", exist_ok=True)

    # stream=True lets the body be written chunk by chunk; the timeout keeps
    # a stalled connection from hanging the whole run.
    with requests.get(url, stream=True, timeout=30) as response:
        # Fail loudly instead of saving an HTTP error page as if it were data.
        response.raise_for_status()
        with open("./data/" + filename, 'wb') as fd:
            for chunk in response.iter_content(chunk_size):
                fd.write(chunk)
    logging.warning('已经写入一条数据')
    return True

# write_kaggle_datasets()

def get_json(pages=25):
    """Download the first ``pages`` listing pages via write_kaggle_datasets.

    Args:
        pages: Number of split files to fetch, using indices
            ``0 .. pages - 1``. Defaults to 25.
    """
    # A plain loop: the calls are made for their side effect only, so there
    # is no reason to collect their return values in a list.
    for i in range(pages):
        write_kaggle_datasets(i)
    logging.info('Hava_Done!')

import json
def get_by_index(i):
    """Load and parse the split file ``./data/dataset_split<i>``.

    Args:
        i: Index of the split file to read.

    Returns:
        The decoded JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read-only mode is enough here, and the context manager guarantees the
    # handle is closed even when parsing fails.
    with open("./data/" + "dataset_split" + str(i), "r", encoding='utf-8') as fd:
        return json.load(fd)

def main():
    """Merge the 25 saved listing pages into ``./data/dataset_demo.csv``.

    The download step (``get_json``) is not invoked here, so the split files
    ``dataset_split0`` .. ``dataset_split24`` must already be present under
    ``./data/``.
    """
    rows = []
    for page in range(25):
        # Each split file holds its entries under "datasetListItems".
        rows.extend(get_by_index(page)["datasetListItems"])

    import pandas as pd
    frame = pd.DataFrame(rows)
    frame.to_csv('./data/dataset_demo.csv')

# Run the CSV conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
分享到 :
0 人收藏
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

积分:7942463
帖子:1588486
精华:0
期权论坛 期权论坛
发布
内容

下载期权论坛手机APP