# Quick overview of both datasets: row counts and item/shop cardinalities.
# (Original cell had all statements fused onto one line — invalid Python.)
print('how many lines in train set:', sales_train.shape)
print('unique items in train set:', sales_train['item_id'].nunique())
print('unique shops in train set:', sales_train['shop_id'].nunique())
print('how many lines in test set:', test.shape)
print('unique items in test set:', test['item_id'].nunique())
print('unique shops in test set:', test['shop_id'].nunique())
how many lines in train set: (2935849, 6)
unique items in train set: 21807
unique shops in train set: 60
how many lines in test set: (214200, 3)
unique items in test set: 5100
unique shops in test set: 42
# Baseline submission: join aggregated per-(shop, item) counts from
# `grouped` (built in an earlier cell) onto the test pairs, fill pairs
# with no history with 0, and clip to the competition's [0, 20] range.
test = pd.read_csv('../readonly/final_project_data/test.csv.gz')
test = pd.merge(test, grouped, on=['shop_id', 'item_id'], how='left')
print(test.head())
test['item_cnt_month'] = test['item_cnt_month'].fillna(0).clip(0, 20)
print(test.head())
test = test[['ID', 'item_cnt_month']]
submission = test.set_index('ID')
submission.to_csv('submission_baseline.csv')
ID shop_id item_id item_cnt_month
0 0 5 5037 NaN
1 1 5 5320 NaN
2 2 5 5233 1.0
3 3 5 5232 NaN
4 4 5 5268 NaN
ID shop_id item_id item_cnt_month
0 0 5 5037 0.0
1 1 5 5320 0.0
2 2 5 5233 1.0
3 3 5 5232 0.0
4 4 5 5268 0.0
def downcast_dtypes(df):
    """Narrow numeric dtypes in place to reduce memory usage.

    float64 columns become float32; int64/int32 columns become int16.
    NOTE(review): int16 overflows above 32767 — item_id (max ~22k) and
    shop_id fit, but verify before reusing on other frames.

    Returns the same (mutated) DataFrame for chaining convenience.
    """
    cols_float64 = [c for c in df if df[c].dtype == 'float64']
    cols_int64_32 = [c for c in df if df[c].dtype in ['int64', 'int32']]
    df[cols_float64] = df[cols_float64].astype(np.float32)
    df[cols_int64_32] = df[cols_int64_32].astype(np.int16)
    return df
# Shrink numeric dtypes on both frames to cut memory before feature
# engineering; info() shows the resulting memory footprint.
sales_train = downcast_dtypes(sales_train)
test = downcast_dtypes(test)
sales_train.info()
# Items with zero sales in every month from block 27 onward are treated
# as outdated (no longer sold).
outdated_items = sales_by_item_id[sales_by_item_id.loc[:, '27':].sum(axis=1) == 0]
print('Outdated items:', len(outdated_items))
# Re-read the raw test set and count how many of its items are outdated.
test = pd.read_csv('../readonly/final_project_data/test.csv.gz')
print('unique items in test set:', test['item_id'].nunique())
print('Outdated items in test set:', test[test['item_id'].isin(outdated_items['item_id'])]['item_id'].nunique())
Outdated items: 12391
unique items in test set: 5100
Outdated items in test set: 164
在训练集中有6行是重复出现的，我们可以删除或者保留，这种数据对结果影响不大。
# Count exact duplicate rows in the training data (per the note above: 6).
print("duplicated lines in sales_train is",len(sales_train[sales_train.duplicated()]))
# Monthly sales per shop: one row per shop_id, one column per
# date_block_num (0..33), filled with 0 where a shop had no sales.
sales_by_shop_id = sales_train.pivot_table(index=['shop_id'], values=['item_cnt_day'],
                                           columns='date_block_num', aggfunc=np.sum, fill_value=0).reset_index()
# Flatten the (value, month) MultiIndex into plain string column names.
sales_by_shop_id.columns = sales_by_shop_id.columns.droplevel().map(str)
sales_by_shop_id = sales_by_shop_id.reset_index(drop=True).rename_axis(None, axis=1)
sales_by_shop_id.columns.values[0] = 'shop_id'
# Shops with zero sales in all months up to month i: not yet open by i.
for i in range(27, 34):
    print('Not exists in month', i,
          sales_by_shop_id['shop_id'][sales_by_shop_id.loc[:, '0':str(i)].sum(axis=1) == 0].unique())
# Shops with zero sales from month i onward: closed/outdated by i.
for i in range(27, 34):
    print('Shop is outdated for month', i,
          sales_by_shop_id['shop_id'][sales_by_shop_id.loc[:, str(i):].sum(axis=1) == 0].unique())
Not exists in month 27 [36]
Not exists in month 28 [36]
Not exists in month 29 [36]
Not exists in month 30 [36]
Not exists in month 31 [36]
Not exists in month 32 [36]
Not exists in month 33 []
Shop is outdated for month 27 [ 0 1 8 11 13 17 23 30 32 40 43]
Shop is outdated for month 28 [ 0 1 8 11 13 17 23 30 32 33 40 43 54]
Shop is outdated for month 29 [ 0 1 8 11 13 17 23 29 30 32 33 40 43 54]
Shop is outdated for month 30 [ 0 1 8 11 13 17 23 29 30 32 33 40 43 54]
Shop is outdated for month 31 [ 0 1 8 11 13 17 23 29 30 32 33 40 43 54]
Shop is outdated for month 32 [ 0 1 8 11 13 17 23 29 30 32 33 40 43 54]
Shop is outdated for month 33 [ 0 1 8 11 13 17 23 27 29 30 32 33 40 43 51 54]
<matplotlib.axes._subplots.AxesSubplot at 0x1e080864a20>
有一个商品的价格是负值,将其填充为median。
# Display the row(s) with a negative item price (notebook cell output).
sales_train[sales_train['item_price']<0]
date
date_block_num
shop_id
item_id
item_price
item_cnt_day
484683
15.05.2013
4
32
2973
-1.0
1.0
# Replace the negative price with the median positive price of the same
# item (2973) in the same shop (32) and month (block 4).
same_context = (
    (sales_train['date_block_num'] == 4)
    & (sales_train['shop_id'] == 32)
    & (sales_train['item_id'] == 2973)
    & (sales_train['item_price'] > 0)
)
median = sales_train[same_context].item_price.median()
sales_train.loc[sales_train['item_price'] < 0, 'item_price'] = median
print(median)
# Partition the test set by how much training history each pair has:
# 1) pairs with at least one (shop, item) sale record,
# 2) items never seen in training at all,
# 3) the remainder — the item exists in training but not for that shop.
test = pd.read_csv('../readonly/final_project_data/test.csv.gz')
good_sales = test.merge(sales_train, on=['item_id', 'shop_id'], how='left').dropna()
good_pairs = test[test['ID'].isin(good_sales['ID'])]
no_data_items = test[~(test['item_id'].isin(sales_train['item_id']))]
print('1. Number of good pairs:', len(good_pairs))
print('2. No Data Items:', len(no_data_items))
print('3. Only Item_id Info:', len(test) - len(no_data_items) - len(good_pairs))
1. Number of good pairs: 111404
2. No Data Items: 15246
3. Only Item_id Info: 87550
# Clean shop names (lowercase, strip punctuation and digits), then derive
# the city (first token) and the store-type abbreviation.
# Raw strings avoid invalid-escape warnings; regex=True is explicit because
# pandas 2.0 changed str.replace's default to literal matching.
shops['shop_name'] = (shops['shop_name'].apply(lambda x: x.lower())
                      .str.replace(r'[^\w\s]', '', regex=True)
                      .str.replace(r'\d+', '', regex=True)
                      .str.strip())
shops['shop_city'] = shops['shop_name'].str.partition(' ')[0]
# Russian store-type tokens, checked longest/most specific first so that
# e.g. 'мтрц' is not matched as 'трц'.
shops['shop_type'] = shops['shop_name'].apply(
    lambda x: 'мтрц' if 'мтрц' in x
    else 'трц' if 'трц' in x
    else 'трк' if 'трк' in x
    else 'тц' if 'тц' in x
    else 'тк' if 'тк' in x
    else 'NO_DATA')
shops.head()
ts = time.time()

# Overall average price per item.
group = sales_train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)
matrix = pd.merge(matrix, group, on=['item_id'], how='left')
matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)

# Average price per item per month.
group = sales_train.groupby(['date_block_num', 'item_id']).agg({'item_price': ['mean']})
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)
matrix = pd.merge(matrix, group, on=['date_block_num', 'item_id'], how='left')
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

# Relative deviation of each lagged monthly price from the item's
# overall average price.
lags = [1, 2, 3, 4, 5, 6, 12]
matrix = lag_feature(matrix, lags, 'date_item_avg_item_price')
for i in lags:
    matrix['delta_price_lag_' + str(i)] = \
        (matrix['date_item_avg_item_price_lag_' + str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    """Return the most recent truthy lagged price delta, else 0.

    NOTE(review): NaN is truthy in Python, so a missing lag value is
    returned as-is; the fillna(0) below masks this — confirm intended.
    """
    for i in lags:
        if row['delta_price_lag_' + str(i)]:
            return row['delta_price_lag_' + str(i)]
    return 0

matrix['delta_price_lag'] = matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)
matrix['delta_price_lag'].fillna(0, inplace=True)
# https://stackoverflow.com/questions/31828240/first-non-null-value-per-row-from-a-list-of-pandas-columns/31828559
# matrix['price_trend'] = matrix[['delta_price_lag_1','delta_price_lag_2','delta_price_lag_3']].bfill(axis=1).iloc[:, 0]
# Invalid dtype for backfill_2d [float16]

# Drop the intermediate price columns, keeping only delta_price_lag.
features_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for i in lags:
    features_to_drop += ['date_item_avg_item_price_lag_' + str(i)]
    features_to_drop += ['delta_price_lag_' + str(i)]
matrix.drop(features_to_drop, axis=1, inplace=True)
time.time() - ts
601.2605240345001
3.2.13 每个月天数
# Calendar month (0-11) and its day count; leap years ignored (Feb = 28).
matrix['month']= matrix['date_block_num']%12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days']= matrix['month'].map(days).astype(np.int8)
3.2.14 开始和结束的销量
Months since the last sale for each shop/item pair and for each item alone. I use a programmatic approach.
Create a hash table whose key is {shop_id, item_id} and whose value is date_block_num. Iterate over the data from the top. For each row, if {row.shop_id, row.item_id} is not present in the table, add it and set its value to row.date_block_num. If the table already contains the key, compute the difference between the cached value and row.date_block_num.
ts = time.time()

def fill_na(df):
    """Fill NaNs in lag-derived item-count columns with 0 (no sale).

    Only columns whose name contains both '_lag_' and 'item_cnt' are
    touched; other lag columns keep their NaNs. Mutates df in place and
    returns it for convenience.
    """
    for col in df.columns:
        if ('_lag_' in col) & (df[col].isnull().any()):
            if ('item_cnt' in col):
                df[col].fillna(0, inplace=True)
    return df

matrix = fill_na(matrix)
time.time() - ts
4. 数据建模
4.1 lightgbm 模型
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import gc
import pickle
from itertools import product
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',100)#import sklearn.model_selection.KFold as KFold