基于python获取雅虎金融股票数据及相关可视化操作

论坛 期权论坛 脚本     
匿名网站用户   2020-12-19 22:37   11   0

@TOC 基于python3实现

1.获取数据

1.1单只股票

(1)加载所需的包

import pandas as pd
import pandas_datareader.data as web   
import datetime as dt

如果报错说xx包不存在则安装相应的包,再执行上述代码
(2)设置股票代码及起始时间,获取数据

# Fetch Google's (ticker GOOG) daily quotes from Yahoo Finance,
# covering 2010-01-01 through today.
start = dt.datetime(2010, 1, 1)
end = dt.datetime.today()
google = web.DataReader('GOOG', 'yahoo', start, end)
google.tail()

在这里插入图片描述
(3)绘图

import matplotlib.pyplot as plt
%matplotlib inline
df['Adj Close'].plot()  #df[].plot()  绘制所有变量的折线图
plt.show()

在这里插入图片描述
(4)绘制日均线图

from matplotlib.dates import DateFormatter, WeekdayLocator, DayLocator, MONDAY,date2num
from mpl_finance import candlestick_ohlc #matplotlib版本不同包可能不一样
import matplotlib.pyplot as plt
def pandas_candlestick_ohlc(data, stick = "day", otherseries = None):
    """
    Show a Japanese candlestick chart for stock data, optionally overlaying
    extra columns (e.g. moving averages) as lines.

    :param data: pandas DataFrame with a datetime64 index and float columns
        "Open", "High", "Low", "Close" (e.g. produced by DataReader "yahoo").
    :param stick: a string or positive int giving the period covered by one
        candlestick. Valid strings: "day" (default), "week", "month", "year";
        an integer means that many trading days per candle.
    :param otherseries: a column name, or iterable of column names, in
        ``data`` to plot as lines on the same axes.
    :raises ValueError: if ``stick`` is neither a recognised string nor a
        positive integer.
    """
    mondays = WeekdayLocator(MONDAY)   # major ticks on Mondays
    alldays = DayLocator()             # minor ticks on every day

    # Work on a copy so the resampling columns below never touch the
    # caller's DataFrame (the original mutated a .loc slice in place).
    transdat = data.loc[:, ["Open", "High", "Low", "Close"]].copy()

    def _aggregate_ohlc(grouped):
        # One OHLC row per group: first open, max high, min low, last close,
        # indexed by the group's first date.
        # DataFrame.append was removed in pandas 2.0; build frames and concat.
        frames = [pd.DataFrame({"Open": group.iloc[0, 0],
                                "High": group["High"].max(),
                                "Low": group["Low"].min(),
                                "Close": group.iloc[-1, 3]},
                               index=[group.index[0]])
                  for _name, group in grouped]
        return pd.concat(frames)

    if isinstance(stick, str):
        if stick == "day":
            plotdat = transdat
            stick = 1  # candle-width multiplier for plotting
        elif stick in ("week", "month", "year"):
            idx = pd.to_datetime(transdat.index)
            if stick == "week":
                transdat["week"] = idx.map(lambda x: x.isocalendar()[1])
            elif stick == "month":
                transdat["month"] = idx.map(lambda x: x.month)
            transdat["year"] = idx.map(lambda x: x.isocalendar()[0])
            plotdat = _aggregate_ohlc(transdat.groupby(list(set(["year", stick]))))
            stick = {"week": 5, "month": 30, "year": 365}[stick]
        else:
            # BUG FIX: unknown strings used to fall through silently and
            # crash later with NameError on `plotdat`.
            raise ValueError('Valid inputs to argument "stick" include the strings "day", "week", "month", "year", or a positive integer')
    elif isinstance(stick, int) and stick >= 1:
        # BUG FIX: the original used np.floor without importing numpy;
        # integer floor-division gives the same grouping keys.
        transdat["stick"] = [i // stick for i in range(len(transdat.index))]
        plotdat = _aggregate_ohlc(transdat.groupby("stick"))
    else:
        raise ValueError('Valid inputs to argument "stick" include the strings "day", "week", "month", "year", or a positive integer')

    # Set plot parameters, including the axis object ax used for plotting.
    fig, ax = plt.subplots()
    fig.subplots_adjust(bottom=0.2)
    if plotdat.index[-1] - plotdat.index[0] < pd.Timedelta('730 days'):
        # Less than ~2 years of data: tick every day, label Mondays.
        weekFormatter = DateFormatter('%b %d')  # e.g., Jan 12
        ax.xaxis.set_major_locator(mondays)
        ax.xaxis.set_minor_locator(alldays)
    else:
        weekFormatter = DateFormatter('%b %d, %Y')
    ax.xaxis.set_major_formatter(weekFormatter)

    ax.grid(True)

    # Create the candlestick chart.
    candlestick_ohlc(ax,
                     list(zip(list(date2num(plotdat.index.tolist())),
                              plotdat["Open"].tolist(), plotdat["High"].tolist(),
                              plotdat["Low"].tolist(), plotdat["Close"].tolist())),
                     colorup="black", colordown="red", width=stick * .4)

    # Plot other series (such as moving averages) as lines.
    # BUG FIX: `!= None` raises "truth value is ambiguous" when a pandas
    # Series is passed; identity comparison is the correct test.
    if otherseries is not None:
        if not isinstance(otherseries, list):
            otherseries = [otherseries]
        data.loc[:, otherseries].plot(ax=ax, lw=1.3, grid=True)

    ax.xaxis_date()
    ax.autoscale_view()
    plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

    plt.show()
    

百度为例

以百度2015-1-1至当前日期的股票数据为例绘图

import pandas as pd
import numpy as np  # BUG FIX: np.round is used below but numpy was never imported here
start = dt.datetime(2015, 1, 1)  # start date 2015-01-01 (the original comment wrongly said 2010)
end = dt.datetime.today()
baidu = web.DataReader('BIDU', 'yahoo', start, end)
pandas_candlestick_ohlc(baidu)
# 20-day simple moving average of the close, rounded to 2 decimals.
baidu["20d"] = np.round(baidu["Close"].rolling(window=20, center=False).mean(), 2)
pandas_candlestick_ohlc(baidu.loc['2019-01-01':'2019-04-22', :], otherseries="20d")

在这里插入图片描述
不同天数移动平均效果对比

# Build 20/50/200-day simple moving averages of the close (rounded to
# 2 decimals) and overlay all three on one candlestick chart.
for window in (20, 50, 200):
    baidu["{}d".format(window)] = np.round(
        baidu["Close"].rolling(window=window, center=False).mean(), 2)
pandas_candlestick_ohlc(baidu.loc['2018-04-01':'2019-04-22', :],
                        otherseries=["20d", "50d", "200d"])

在这里插入图片描述

1.2 多只股票

(1)爬取谷歌、百度、阿里巴巴2015-1-1至当天的股票数据(调整过后的价格)

import pandas as pd
import pandas_datareader.data as web
import datetime as dt
# (a duplicated `import pandas as pd` line was removed)

start = dt.datetime(2015, 1, 1)  # start date 2015-01-01
end = dt.datetime.today()

google = web.get_data_yahoo('GOOG', start, end)
alibaba = web.get_data_yahoo('BABA', start, end)
baidu = web.get_data_yahoo('BIDU', start, end)

# One frame of adjusted closes, columns per ticker, aligned on date.
stocks = pd.DataFrame({"GOOG": google["Adj Close"],
                       "BABA": alibaba["Adj Close"],
                       "BIDU": baidu["Adj Close"]})
stocks.tail()

在这里插入图片描述
从上面的结果可看出谷歌的调整后价格远高于阿里和百度,因此在绘图时若使用相同尺度的纵坐标则阿里和百度的变化趋势很小,不便于观察,因此单独对阿里和百度设置第二纵坐标。

# BABA and BIDU trade far below GOOG, so give them their own right-hand axis.
stocks.plot(grid=True, secondary_y=["BABA", "BIDU"])

在这里插入图片描述
未设置第二纵坐标时

# Same data on a single shared y-axis, for comparison with the plot above.
stocks.plot(grid=True)

在这里插入图片描述
股票对数收益率 $r_t = \log(p_t) - \log(p_{t-1})$

import numpy as np
# Daily log returns: r_t = log(p_t) - log(p_{t-1}).
# Series.diff() subtracts the previous row, i.e. log(p).diff() == log(p) - log(p).shift(1).
stock_change = stocks.apply(lambda price: np.log(price).diff())
stock_change.head()
stock_change.plot(grid=True).axhline(y=0, color="red", lw=2)

在这里插入图片描述
(2)以爬虫架构获取更多股票数据
爬取标准普尔500指数(S&P 500 Index)成分股数据。该指数是记录美国500家上市公司的股票指数,由标准普尔公司创建并维护。

前面我们设计了程序爬取一个股票,前提是我们知道这个股票的代码,现在我们要批量下载股票信息,就需要知道这500个股票的代码。

这个只能借助爬虫来完成。

https://en.wikipedia.org/wiki/List_of_S%26P_500_companies

BeautifulSoup4

!pip3 install beautifulsoup4 #jupyter 下安装包

BeautifulSoup是一种非常优雅的专门用于进行HTML/XML数据解析的一种描述语言,可以很好的分析和筛选HTML/XML这样的标记文档中的指定规则数据

pip install beautifulsoup4

然后我们安装lxml,这是一个解析器,BeautifulSoup可以使用它来解析HTML,然后提取内容。
Anaconda 使用下面命令安装lxml: conda install lxml 或 pip install lxml

#爬虫工具:分析网页
import bs4 as bs
# 序列化和反序列化工具
import pickle
#python的一个HTTP客户端库
import requests

# Scrape the S&P 500 ticker symbols from Wikipedia and pickle the result.
def get_sp500_tickers():
    """Fetch the S&P 500 constituents table from Wikipedia.

    Writes the ticker list to ``sp500tickers45482.pickle`` and also returns
    it, so ``get_data_from_yahoo(reload_sp500=True)`` can use the result
    directly (the original returned nothing, yielding ``tickers = None``).
    """
    resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, "lxml")
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        # NOTE(review): column index 1 matched the table layout when this was
        # written — verify against the current Wikipedia markup.
        # BUG FIX: .text keeps the cell's trailing newline, which corrupts
        # the symbols (and later the CSV file names) — strip it.
        ticker = row.findAll('td')[1].text.strip()
        tickers.append(ticker)
    print(tickers)

    with open("sp500tickers45482.pickle", "wb") as f:
        pickle.dump(tickers, f)  # serialize the list (the old comment said "deserialize")
    return tickers

获取上述sp500tickers45482.pickle 保存的股票代码,并爬取每个在2010-1-1至当前日期的股票数据

import pandas_datareader as web
import datetime as dt
import os
import pickle
def get_data_from_yahoo(reload_sp500 = False):
    """Download daily Yahoo data (2010-01-01 .. today) for every S&P 500 ticker.

    :param reload_sp500: when True, re-scrape the ticker list from Wikipedia;
        otherwise load the previously pickled list.

    Saves one CSV per ticker under ``stock_dfs45482/`` and skips tickers whose
    CSV already exists. Tickers containing ``.B`` (class-B shares such as
    BRK.B) are reported and skipped, as Yahoo does not resolve that notation.
    """
    if reload_sp500:
        # BUG FIX: the original called save_sp500_tickers(), which is not
        # defined anywhere — the scraper above is named get_sp500_tickers.
        tickers = get_sp500_tickers()
    else:
        with open("sp500tickers45482.pickle", "rb") as f:
            tickers = pickle.load(f)

    if not os.path.exists('stock_dfs45482'):
        os.makedirs('stock_dfs45482')

    start = dt.datetime(2010, 1, 1)
    end = dt.datetime.today()
    for num, ticker in enumerate(tickers, start=1):  # enumerate replaces the manual counter
        if '.B' in ticker:
            print("error sample:" + ticker)
            continue
        csv_path = 'stock_dfs45482/{}.csv'.format(ticker)
        if not os.path.exists(csv_path):
            df = web.DataReader(ticker, 'yahoo', start, end)
            df.to_csv(csv_path)
            print("正在下载{0:>5}...{1} / 500".format(ticker, num))
        else:
            print("文件{0:>5}已经存在. {1}/500".format(ticker, num))
分享到 :
0 人收藏
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

积分:1136255
帖子:227251
精华:0
期权论坛 期权论坛
发布
内容

下载期权论坛手机APP