1.导入 Pandas 库并简写为 pd,并输出版本号
>>> import pandas as pd
>>> pd.__version__
'1.1.4'
2.从列表创建series
>>> data = [1,2,3,4,5,6]
>>> frame = pd.Series(data, index = ['A','B','C','D','E','F'])
>>> frame
A 1
B 2
C 3
D 4
E 5
F 6
dtype: int64
3.从字典创建series
>>> data = {'A':1, 'B':2, 'C':3, 'D':4}
>>> frame=pd.Series(data)
>>> frame
A 1
B 2
C 3
D 4
dtype: int64
4.从Numpy数组创建DataFrame
>>> import numpy as np
>>> data = np.arange(6).reshape(2,3)
>>> data
array([[0, 1, 2],
[3, 4, 5]])
>>> frame = pd.DataFrame(data,index=['A','B'],columns=['O','T','C'])
>>> frame
O T C
A 0 1 2
B 3 4 5
5.从csv中创建DataFrame,分隔符为",",编码为“utf8”
>>> frame = pd.read_csv(r'C:\Users\MARS\Desktop\测试数据.csv', sep=',',encoding='utf8')
6.从字典对象创建DataFrame,索引设置为labels
>>> import numpy as np
>>> import pandas as pd
>>> data = {'animal':['cat','cat','snake','dog','god','cat','snake','cat','dog','dog'],'age':[2.5,3,0.5,np.nan,5,2,4.5,np.nan,7,3],'visits':[1,3,2,3,2,3,1,1,2,1],'priority':['yes','yes','no','yes','no','no','no','yse','no','no']}
>>> labels = ['a','b','c','d','e','f','g','h','i','j']
>>> frame = pd.DataFrame(data,index = labels)
>>> frame
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e god 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yse
i dog 7.0 2 no
j dog 3.0 1 no
7.显示DataFrame的基础信息,包括行数,列名,值的数量和类型
>>> frame.info()
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 animal 10 non-null object
1 age 8 non-null float64
2 visits 10 non-null int64
3 priority 10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 472.0+ bytes
8.展示前三行(两种方式)
>>> frame.iloc[:3]
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
>>> frame.head(3)
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
9.取出frame的animal 和age 列
>>> frame.loc[:,['animal','age']]
animal age
a cat 2.5
b cat 3.0
c snake 0.5
d dog NaN
e god 5.0
f cat 2.0
g snake 4.5
h cat NaN
i dog 7.0
j dog 3.0
>>> frame[['animal','age']]
animal age
a cat 2.5
b cat 3.0
c snake 0.5
d dog NaN
e god 5.0
f cat 2.0
g snake 4.5
h cat NaN
i dog 7.0
j dog 3.0
10.取出索引为[3, 4, 8] 行的animal 和age 列
>>> frame.loc[frame.index[[3,4,8]],['animal','age']]
animal age
d dog NaN
e god 5.0
i dog 7.0
11.取出age 值大于3的行
>>> frame[frame['age']>3]
animal age visits priority
e god 5.0 2 no
g snake 4.5 1 no
i dog 7.0 2 no
12.取出age 值缺失的行
>>> frame[frame['age'].isnull()]
animal age visits priority
d dog NaN 3 yes
h cat NaN 1 yse
13.取出age 在2,4间的行(不含端点)
>>> frame[(frame['age']>2) &( frame['age']<4)]
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
j dog 3.0 1 no
14.f 行的age 改为1.5
>>> frame.loc['f','age']=1.5
15.计算visits 的总和
>>> frame['visits'].sum()
19
16.计算每个不同种类animal 的age 的平均数
>>> frame.groupby('animal')['age'].mean()
animal
cat 2.333333
dog 5.000000
god 5.000000
snake 2.500000
Name: age, dtype: float64
17.计算frame 中每个种类animal 的数量(两种方式)
>>> frame.groupby('animal')['animal'].count()
animal
cat 4
dog 3
god 1
snake 2
Name: animal, dtype: int64
>>> frame['animal'].value_counts()
cat 4
dog 3
snake 2
god 1
Name: animal, dtype: int64
18.先按age 降序排列,后按visits 升序排列
>>> frame.sort_values(by=['age','visits'],ascending=[False,True])
animal age visits priority
i dog 7.0 2 no
e god 5.0 2 no
g snake 4.5 1 no
j dog 3.0 1 no
b cat 3.0 3 yes
a cat 2.5 1 yes
f cat 1.5 3 no
c snake 0.5 2 no
h cat NaN 1 yse
d dog NaN 3 yes
19.将priority 列中的yes, no 替换为布尔值True, False(注意:h 行的原始值是拼写错误的'yse',不在映射字典中,因此map 之后变为NaN)
>>> frame['priority'] = frame['priority'].map({'yes':True,'no':False})
>>> frame
animal age visits priority
a cat 2.5 1 True
b cat 3.0 3 True
c snake 0.5 2 False
d dog NaN 3 True
e god 5.0 2 False
f cat 1.5 3 False
g snake 4.5 1 False
h cat NaN 1 NaN
i dog 7.0 2 False
j dog 3.0 1 False
20.将animal 列中的snake 替换为python
>>> frame['animal'] = frame['animal'].replace('snake','python')
>>> frame
animal age visits priority
a cat 2.5 1 True
b cat 3.0 3 True
c python 0.5 2 False
d dog NaN 3 True
e god 5.0 2 False
f cat 1.5 3 False
g python 4.5 1 False
h cat NaN 1 NaN
i dog 7.0 2 False
j dog 3.0 1 False
21.对每种animal 的每种不同数量visits ,计算平均age ,即,返回一个表格,行是animal 种类,列是visits 数量,表格值是对应动物种类和访问次数下的平均年龄
>>> frame.groupby(['animal','visits'])['age'].mean()
animal visits
cat 1 2.50
3 2.25
dog 1 3.00
2 7.00
3 NaN
god 2 5.00
python 1 4.50
2 0.50
Name: age, dtype: float64
>>> frame.dtypes
animal object
age float64
visits int64
priority object
dtype: object
>>> frame.age = frame.age.astype(float)
>>> frame.pivot_table(index='animal', columns='visits',values = 'age',aggfunc='mean')
visits 1 2 3
animal
cat 2.5 NaN 2.25
dog 3.0 7.0 NaN
god NaN 5.0 NaN
python 4.5 0.5 NaN
22.在frame中插入新行K ,然后删除该行(drop 默认返回新的DataFrame,不会修改原frame)
>>> frame.loc['K'] = ['cat',5,2,'no']
>>> frame
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e god 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yse
i dog 7.0 2 no
j dog 3.0 1 no
K cat 5.0 2 no
>>> frame.drop('K')
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e god 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yse
i dog 7.0 2 no
j dog 3.0 1 no
23.有一列整数列A 的DataFrame,删除数值重复的行(两种方式;shift 的写法只能去掉相邻的重复值)
>>> df = pd.DataFrame({'A':[1,2,2,3,4,5,5,5,6,6,7,7]})
>>> df
A
0 1
1 2
2 2
3 3
4 4
5 5
6 5
7 5
8 6
9 6
10 7
11 7
>>> df1 = df.loc[df['A'].shift() != df['A']]
>>> df1
A
0 1
1 2
3 3
4 4
5 5
8 6
10 7
>>> df1 = df.drop_duplicates(subset='A')
>>> df1
A
0 1
1 2
3 3
4 4
5 5
8 6
10 7
>>>
24.一个全数值DataFrame,每个数字减去该行的平均数
>>> df = pd.DataFrame(np.random.random(size=(5, 3)))
>>> df
0 1 2
0 0.679254 0.007655 0.312684
1 0.494291 0.800074 0.506416
2 0.518933 0.822710 0.250825
3 0.748430 0.201992 0.227088
4 0.247771 0.044589 0.569569
>>> df1=df.sub(df.mean(axis=1), axis=0)
>>> df1
0 1 2
0 0.346057 -0.325543 -0.020514
1 -0.105969 0.199814 -0.093845
2 -0.011890 0.291887 -0.279997
3 0.355926 -0.190511 -0.165415
4 -0.039538 -0.242720 0.282259
|