第一章:第一節資料載入及初步觀察-課程

瑟兰迪尔·绿叶發表於2024-03-12
import numpy as np
import pandas as pd
import os
df=pd.read_csv('train.csv')#拿出train.csv中的訓練集,使用相對路徑
df.shape#維數
(891, 12)
df.T#行與列交換
0 1 2 3 4 5 6 7 8 9 ... 881 882 883 884 885 886 887 888 889 890
PassengerId 1 2 3 4 5 6 7 8 9 10 ... 882 883 884 885 886 887 888 889 890 891
Survived 0 1 1 1 0 0 0 0 1 1 ... 0 0 0 0 0 0 1 0 1 0
Pclass 3 1 3 1 3 3 1 3 3 2 ... 3 3 2 3 3 2 1 3 1 3
Name Braund, Mr. Owen Harris Cumings, Mrs. John Bradley (Florence Briggs Th... Heikkinen, Miss. Laina Futrelle, Mrs. Jacques Heath (Lily May Peel) Allen, Mr. William Henry Moran, Mr. James McCarthy, Mr. Timothy J Palsson, Master. Gosta Leonard Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) Nasser, Mrs. Nicholas (Adele Achem) ... Markun, Mr. Johann Dahlberg, Miss. Gerda Ulrika Banfield, Mr. Frederick James Sutehall, Mr. Henry Jr Rice, Mrs. William (Margaret Norton) Montvila, Rev. Juozas Graham, Miss. Margaret Edith Johnston, Miss. Catherine Helen "Carrie" Behr, Mr. Karl Howell Dooley, Mr. Patrick
Sex male female female female male male male male female female ... male female male male female male female female male male
Age 22.0 38.0 26.0 35.0 35.0 NaN 54.0 2.0 27.0 14.0 ... 33.0 22.0 28.0 25.0 39.0 27.0 19.0 NaN 26.0 32.0
SibSp 1 1 0 1 0 0 0 3 0 1 ... 0 0 0 0 0 0 0 1 0 0
Parch 0 0 0 0 0 0 0 1 2 0 ... 0 0 0 0 5 0 0 2 0 0
Ticket A/5 21171 PC 17599 STON/O2. 3101282 113803 373450 330877 17463 349909 347742 237736 ... 349257 7552 C.A./SOTON 34068 SOTON/OQ 392076 382652 211536 112053 W./C. 6607 111369 370376
Fare 7.25 71.2833 7.925 53.1 8.05 8.4583 51.8625 21.075 11.1333 30.0708 ... 7.8958 10.5167 10.5 7.05 29.125 13.0 30.0 23.45 30.0 7.75
Cabin NaN C85 NaN C123 NaN NaN E46 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN B42 NaN C148 NaN
Embarked S C S S S Q S S S C ... S S S S Q S S S C Q

12 rows × 891 columns

path=os.path.abspath('train.csv')#查詢絕對路徑
pd.read_csv(path)#使用絕對路徑
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

pd.read_table(path)#read_table預設以Tab(\t)為分隔符,所以逗號分隔的檔案會被整行讀成單一列
PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0 1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/...
1 2,1,1,"Cumings, Mrs. John Bradley (Florence Br...
2 3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,S...
3 4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May ...
4 5,0,3,"Allen, Mr. William Henry",male,35,0,0,3...
... ...
886 887,0,2,"Montvila, Rev. Juozas",male,27,0,0,21...
887 888,1,1,"Graham, Miss. Margaret Edith",female,...
888 889,0,3,"Johnston, Miss. Catherine Helen ""Car...
889 890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,11...
890 891,0,3,"Dooley, Mr. Patrick",male,32,0,0,3703...

891 rows × 1 columns

pd.read_table(path,sep=',')#將預設改為逗號為分隔符
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

df=pd.read_csv('train.csv',chunksize=1000)#chunksize為每次迭代讀取的行數,這裡每一塊最多1000行(本檔只有891行,所以只有一塊)
for i in df:#得到逐塊資料
    print(i)
     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ...    ...   
886                              Montvila, Rev. Juozas    male  27.0      0   
887                       Graham, Miss. Margaret Edith  female  19.0      0   
888           Johnston, Miss. Catherine Helen "Carrie"  female   NaN      1   
889                              Behr, Mr. Karl Howell    male  26.0      0   
890                                Dooley, Mr. Patrick    male  32.0      0   

     Parch            Ticket     Fare Cabin Embarked  
0        0         A/5 21171   7.2500   NaN        S  
1        0          PC 17599  71.2833   C85        C  
2        0  STON/O2. 3101282   7.9250   NaN        S  
3        0            113803  53.1000  C123        S  
4        0            373450   8.0500   NaN        S  
..     ...               ...      ...   ...      ...  
886      0            211536  13.0000   NaN        S  
887      0            112053  30.0000   B42        S  
888      2        W./C. 6607  23.4500   NaN        S  
889      0            111369  30.0000  C148        C  
890      0            370376   7.7500   NaN        Q  

[891 rows x 12 columns]
df.get_chunk()#得到下一塊資料(方法2);若df已被for迴圈讀完,需先重新執行read_csv(...,chunksize=...),否則會拋出StopIteration
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

df=pd.read_csv('train.csv')
df.columns=['乘客ID','是否倖存','乘客等級(1/2/3等艙位)','乘客姓名','性別','年齡','堂兄弟/妹個數','父母與小孩個數','船票資訊','票價','客艙','登船港口']
#將表頭改成中文(直接替換df.columns);索引仍是預設的0~890,若要以乘客ID為索引需另外執行df.set_index('乘客ID')
df
乘客ID 是否倖存 乘客等級(1/2/3等艙位) 乘客姓名 性別 年齡 堂兄弟/妹個數 父母與小孩個數 船票資訊 票價 客艙 登船港口
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

df=pd.read_csv('train.csv',names=['乘客ID','是否倖存','乘客等級(1/2/3等艙位)','乘客姓名','性別','年齡','堂兄弟/妹個數','父母與小孩個數','船票資訊','票價','客艙','登船港口'])
#用names指定中文表頭;因為沒有同時指定header=0,原本的英文表頭會被當成第一行資料(所以共892行,且各列dtype都變成object)
df
乘客ID 是否倖存 乘客等級(1/2/3等艙位) 乘客姓名 性別 年齡 堂兄弟/妹個數 父母與小孩個數 船票資訊 票價 客艙 登船港口
0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
1 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.25 NaN S
2 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
3 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.925 NaN S
4 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1 C123 S
... ... ... ... ... ... ... ... ... ... ... ... ...
887 887 0 2 Montvila, Rev. Juozas male 27 0 0 211536 13 NaN S
888 888 1 1 Graham, Miss. Margaret Edith female 19 0 0 112053 30 B42 S
889 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.45 NaN S
890 890 1 1 Behr, Mr. Karl Howell male 26 0 0 111369 30 C148 C
891 891 0 3 Dooley, Mr. Patrick male 32 0 0 370376 7.75 NaN Q

892 rows × 12 columns

df.info()#檢視資料的基本資訊
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   乘客ID            892 non-null    object
 1   是否倖存            892 non-null    object
 2   乘客等級(1/2/3等艙位)  892 non-null    object
 3   乘客姓名            892 non-null    object
 4   性別              892 non-null    object
 5   年齡              715 non-null    object
 6   堂兄弟/妹個數         892 non-null    object
 7   父母與小孩個數         892 non-null    object
 8   船票資訊            892 non-null    object
 9   票價              892 non-null    object
 10  客艙              205 non-null    object
 11  登船港口            890 non-null    object
dtypes: object(12)
memory usage: 83.8+ KB
df.describe()#檢視資料的基本資訊
乘客ID 是否倖存 乘客等級(1/2/3等艙位) 乘客姓名 性別 年齡 堂兄弟/妹個數 父母與小孩個數 船票資訊 票價 客艙 登船港口
count 892 892 892 892 892 715 892 892 892 892 205 890
unique 892 3 4 892 3 89 8 8 682 249 148 4
top PassengerId 0 3 Name male 24 0 0 1601 8.05 C23 C25 C27 S
freq 1 549 491 1 577 30 608 678 7 43 4 644
df.head(15)#前十五個資料
乘客ID 是否倖存 乘客等級(1/2/3等艙位) 乘客姓名 性別 年齡 堂兄弟/妹個數 父母與小孩個數 船票資訊 票價 客艙 登船港口
0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
1 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.25 NaN S
2 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
3 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.925 NaN S
4 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1 C123 S
5 5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.05 NaN S
6 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
7 7 0 1 McCarthy, Mr. Timothy J male 54 0 0 17463 51.8625 E46 S
8 8 0 3 Palsson, Master. Gosta Leonard male 2 3 1 349909 21.075 NaN S
9 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27 0 2 347742 11.1333 NaN S
10 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14 1 0 237736 30.0708 NaN C
11 11 1 3 Sandstrom, Miss. Marguerite Rut female 4 1 1 PP 9549 16.7 G6 S
12 12 1 1 Bonnell, Miss. Elizabeth female 58 0 0 113783 26.55 C103 S
13 13 0 3 Saundercock, Mr. William Henry male 20 0 0 A/5. 2151 8.05 NaN S
14 14 0 3 Andersson, Mr. Anders Johan male 39 1 5 347082 31.275 NaN S
df.tail(15)#後十五個資料
乘客ID 是否倖存 乘客等級(1/2/3等艙位) 乘客姓名 性別 年齡 堂兄弟/妹個數 父母與小孩個數 船票資訊 票價 客艙 登船港口
877 877 0 3 Gustafsson, Mr. Alfred Ossian male 20 0 0 7534 9.8458 NaN S
878 878 0 3 Petroff, Mr. Nedelio male 19 0 0 349212 7.8958 NaN S
879 879 0 3 Laleff, Mr. Kristo male NaN 0 0 349217 7.8958 NaN S
880 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56 0 1 11767 83.1583 C50 C
881 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25 0 1 230433 26 NaN S
882 882 0 3 Markun, Mr. Johann male 33 0 0 349257 7.8958 NaN S
883 883 0 3 Dahlberg, Miss. Gerda Ulrika female 22 0 0 7552 10.5167 NaN S
884 884 0 2 Banfield, Mr. Frederick James male 28 0 0 C.A./SOTON 34068 10.5 NaN S
885 885 0 3 Sutehall, Mr. Henry Jr male 25 0 0 SOTON/OQ 392076 7.05 NaN S
886 886 0 3 Rice, Mrs. William (Margaret Norton) female 39 0 5 382652 29.125 NaN Q
887 887 0 2 Montvila, Rev. Juozas male 27 0 0 211536 13 NaN S
888 888 1 1 Graham, Miss. Margaret Edith female 19 0 0 112053 30 B42 S
889 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.45 NaN S
890 890 1 1 Behr, Mr. Karl Howell male 26 0 0 111369 30 C148 C
891 891 0 3 Dooley, Mr. Patrick male 32 0 0 370376 7.75 NaN Q
df.isnull()#true表示資料為空
乘客ID 是否倖存 乘客等級(1/2/3等艙位) 乘客姓名 性別 年齡 堂兄弟/妹個數 父母與小孩個數 船票資訊 票價 客艙 登船港口
0 False False False False False False False False False False True False
1 False False False False False False False False False False False False
2 False False False False False False False False False False True False
3 False False False False False False False False False False False False
4 False False False False False False False False False False True False
... ... ... ... ... ... ... ... ... ... ... ... ...
886 False False False False False False False False False False True False
887 False False False False False False False False False False False False
888 False False False False False True False False False False True False
889 False False False False False False False False False False False False
890 False False False False False False False False False False True False

891 rows × 12 columns

df.to_csv('train_chinese.csv')
s=pd.Series(np.random.randn(5),index=['a','b','c','d','e'])#隨機生成1維資料
s
a   -0.557197
b    1.348717
c    0.228413
d    0.356737
e   -0.123567
dtype: float64
s1=pd.Series([1,2,3,4,5],index=['a','b','c','d','e'])#生成1維資料
type(s1)
pandas.core.series.Series
s=pd.Series({'a':1,'b':2,'c':3,'d':4,'e':5})#將字典實例化為Series(鍵成為索引)
s
a    1
b    2
c    3
d    4
e    5
dtype: int64
d={'one':pd.Series([1,2,3,4,5],index=['a','b','c','d','e']),'two':pd.Series([6,7,8,9,10],index=['a','b','c','d','e'])}
d#生成二維資料
{'one': a    1
 b    2
 c    3
 d    4
 e    5
 dtype: int64,
 'two': a     6
 b     7
 c     8
 d     9
 e    10
 dtype: int64}
pd.DataFrame(d)#由字典生成DataFrame(二維表格),字典的鍵成為列名
one two
a 1 6
b 2 7
c 3 8
d 4 9
e 5 10
df=pd.read_csv('train.csv')
df.columns#檢視df每一列的項
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
df.Cabin#檢視Cabin列的所有項(方法1)
0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object
df['Cabin']#檢視Cabin列的所有項(方法2),返回型別為Series型別
0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object
df[['Cabin']]#返回型別為DataFrame型別
Cabin
0 NaN
1 C85
2 NaN
3 C123
4 NaN
... ...
886 NaN
887 B42
888 NaN
889 C148
890 NaN

891 rows × 1 columns

test_1=pd.read_csv('test_1.csv')
test_1
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked a
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 100
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 100
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 100
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 100
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 100
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S 100
887 887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 100
888 888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S 100
889 889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 100
890 890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q 100

891 rows × 14 columns

del test_1['a']#刪除a列資料(方法1)
test_1
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 13 columns

a=test_1.pop('a')#刪除a列資料(方法2),pop會返回被刪除的那一列(Series),這裡存入變數a
a
0      100
1      100
2      100
3      100
4      100
      ... 
886    100
887    100
888    100
889    100
890    100
Name: a, Length: 891, dtype: int64
test_1.drop(['a'],axis=1)#刪除a列資料(方法3),axis=1表示列,axis=0表示行(返回的是副本,不是test_1本身)
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 13 columns

test_1.drop(['a'],axis=1,inplace=True)#inplace=True表示直接在test_1本身上修改(方法返回None),母本會被改變
test_1
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 13 columns

test_1=pd.read_csv('test_1.csv')
test_1
test_1.drop(['a','PassengerId','Survived','Pclass'],axis=1)#返回的是沒有以上元素的副本
Unnamed: 0 Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 4 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ...
886 886 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 887 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 888 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 889 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 890 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 10 columns

test_1=pd.read_csv('test_1.csv')
test_1['Age'] < 10 #當Age小於10時返回True,其餘(包含Age為NaN)返回False
0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool
test_1[test_1['Age'] < 10]#將結果為True的拿出來
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked a
7 7 8 0 3 Palsson, Master. Gosta Leonard male 2.00 3 1 349909 21.0750 NaN S 100
10 10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.00 1 1 PP 9549 16.7000 G6 S 100
16 16 17 0 3 Rice, Master. Eugene male 2.00 4 1 382652 29.1250 NaN Q 100
24 24 25 0 3 Palsson, Miss. Torborg Danira female 8.00 3 1 349909 21.0750 NaN S 100
43 43 44 1 2 Laroche, Miss. Simonne Marie Anne Andree female 3.00 1 2 SC/Paris 2123 41.5792 NaN C 100
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
827 827 828 1 2 Mallet, Master. Andre male 1.00 0 2 S.C./PARIS 2079 37.0042 NaN C 100
831 831 832 1 2 Richards, Master. George Sibley male 0.83 1 1 29106 18.7500 NaN S 100
850 850 851 0 3 Andersson, Master. Sigvard Harald Elias male 4.00 4 2 347082 31.2750 NaN S 100
852 852 853 0 3 Boulos, Miss. Nourelain female 9.00 1 1 2678 15.2458 NaN C 100
869 869 870 1 3 Johnson, Master. Harold Theodor male 4.00 1 1 347742 11.1333 NaN S 100

62 rows × 14 columns

midage=test_1[(test_1['Age'] > 10) & (test_1['Age'] < 50)]#返回Age大於10歲,且Age小於50歲的結果
midage.head()
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked a
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 100
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 100
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 100
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 100
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 100
midage1=test_1[(test_1['Age'] > 50) | (test_1['Age'] < 10)]#返回Age小於10歲或Age大於50歲的結果
midage1
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked a
6 6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S 100
7 7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S 100
10 10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S 100
11 11 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S 100
15 15 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 248706 16.0000 NaN S 100
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
851 851 852 0 3 Svensson, Mr. Johan male 74.0 0 0 347060 7.7750 NaN S 100
852 852 853 0 3 Boulos, Miss. Nourelain female 9.0 1 1 2678 15.2458 NaN C 100
857 857 858 1 1 Daly, Mr. Peter Denis male 51.0 0 0 113055 26.5500 E17 S 100
869 869 870 1 3 Johnson, Master. Harold Theodor male 4.0 1 1 347742 11.1333 NaN S 100
879 879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 11767 83.1583 C50 C 100

126 rows × 14 columns

midage.loc[[100],['Pclass','Sex']]#取出midage中索引為100的資料並拿出Pclass和Sex的結果(並不是第100個資料)
#索引還是與拿過來時的索引一樣,不能代表排列順序
Pclass Sex
100 3 female
midage.to_csv('midage.csv')#將生成的midage表格儲存
midage1=midage.reset_index(drop=True)#將midage中的索引改成由0開始的連續排列索引(drop=True是為了刪除原來索引,不加的話原來的索引會變成資料)
midage1.to_csv('midage1.csv')
midage1.loc[[100],['Pclass','Sex']]#此時取出的是真正的第100個資料
Pclass Sex
100 2 male
midage.index
Index([  0,   1,   2,   3,   4,   8,   9,  12,  13,  14,
       ...
       880, 881, 882, 883, 884, 885, 886, 887, 889, 890],
      dtype='int64', length=576)
midage1.loc[[100,105,108],['Pclass','Name','Sex']]#取出第100,105,108個資料
Pclass Name Sex
100 2 Byles, Rev. Thomas Roussel Davids male
105 3 Cribb, Mr. John Hatfield male
108 3 Calic, Mr. Jovo male
midage1.iloc[[100,105,108],[3,4,5]]#iloc按整數位置取值,行與列都用位置編號(從0開始數),如Pclass是從左往右數的第3列,所以用3表示
Pclass Name Sex
100 2 Byles, Rev. Thomas Roussel Davids male
105 3 Cribb, Mr. John Hatfield male
108 3 Calic, Mr. Jovo male
midage1.head(3)
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked a
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 100
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 100
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 100
sample = pd.DataFrame(np.random.randn(6,4),
                      index=list('632451'),
                     columns=list('BDCA'))
sample#隨機生成一個6*4的矩陣,行索引為6,3,2,4,5,1、列名為B,D,C,A(刻意打亂順序,方便後面示範排序)
B D C A
6 -1.240358 -1.264760 0.530587 -0.641027
3 -0.774196 0.063058 0.908744 -0.111125
2 -0.743644 -0.142332 1.577287 0.409604
4 3.055682 -0.207313 1.322307 0.008019
5 1.191964 0.791010 0.306310 1.400323
1 0.113995 0.514866 -0.219842 -0.149131
sample.sort_values('B')#按B從小到大排列,axis=0預設按行排序,ascending=False表示按降序排列,inplace=True表示將母本替換
B D C A
6 -1.240358 -1.264760 0.530587 -0.641027
3 -0.774196 0.063058 0.908744 -0.111125
2 -0.743644 -0.142332 1.577287 0.409604
1 0.113995 0.514866 -0.219842 -0.149131
5 1.191964 0.791010 0.306310 1.400323
4 3.055682 -0.207313 1.322307 0.008019
sample.sort_index()#按行索引順序排列
B D C A
1 0.113995 0.514866 -0.219842 -0.149131
2 -0.743644 -0.142332 1.577287 0.409604
3 -0.774196 0.063058 0.908744 -0.111125
4 3.055682 -0.207313 1.322307 0.008019
5 1.191964 0.791010 0.306310 1.400323
6 -1.240358 -1.264760 0.530587 -0.641027
sample.sort_index(axis=1)#按列順序排列
A B C D
6 -0.641027 -1.240358 0.530587 -1.264760
3 -0.111125 -0.774196 0.908744 0.063058
2 0.409604 -0.743644 1.577287 -0.142332
4 0.008019 3.055682 1.322307 -0.207313
5 1.400323 1.191964 0.306310 0.791010
1 -0.149131 0.113995 -0.219842 0.514866
sample.sort_index(axis=1,ascending=False)#按列降序排列
D C B A
6 -1.264760 0.530587 -1.240358 -0.641027
3 0.063058 0.908744 -0.774196 -0.111125
2 -0.142332 1.577287 -0.743644 0.409604
4 -0.207313 1.322307 3.055682 0.008019
5 0.791010 0.306310 1.191964 1.400323
1 0.514866 -0.219842 0.113995 -0.149131
sample.sort_values(['B','A'],ascending=False)#任選兩列資料降序排列
B D C A
4 3.055682 -0.207313 1.322307 0.008019
5 1.191964 0.791010 0.306310 1.400323
1 0.113995 0.514866 -0.219842 -0.149131
2 -0.743644 -0.142332 1.577287 0.409604
3 -0.774196 0.063058 0.908744 -0.111125
6 -1.240358 -1.264760 0.530587 -0.641027
df=pd.read_csv('train.csv')
df.columns=['乘客ID','是否倖存','乘客等級(1/2/3等艙位)','乘客姓名','性別','年齡','堂兄弟/妹個數','父母與小孩個數','船票資訊','票價','客艙','登船港口']
df.sort_values(['票價','年齡'],ascending=False)#優先按票價降序排序,票價相同時再按年齡降序排序
乘客ID 是否倖存 乘客等級(1/2/3等艙位) 乘客姓名 性別 年齡 堂兄弟/妹個數 父母與小孩個數 船票資訊 票價 客艙 登船港口
679 680 1 1 Cardeza, Mr. Thomas Drake Martinez male 36.0 0 1 PC 17755 512.3292 B51 B53 B55 C
258 259 1 1 Ward, Miss. Anna female 35.0 0 0 PC 17755 512.3292 NaN C
737 738 1 1 Lesurer, Mr. Gustave J male 35.0 0 0 PC 17755 512.3292 B101 C
438 439 0 1 Fortune, Mr. Mark male 64.0 1 4 19950 263.0000 C23 C25 C27 S
341 342 1 1 Fortune, Miss. Alice Elizabeth female 24.0 3 2 19950 263.0000 C23 C25 C27 S
... ... ... ... ... ... ... ... ... ... ... ... ...
481 482 0 2 Frost, Mr. Anthony Wood "Archie" male NaN 0 0 239854 0.0000 NaN S
633 634 0 1 Parr, Mr. William Henry Marsh male NaN 0 0 112052 0.0000 NaN S
674 675 0 2 Watson, Mr. Ennis Hastings male NaN 0 0 239856 0.0000 NaN S
732 733 0 2 Knight, Mr. Robert J male NaN 0 0 239855 0.0000 NaN S
815 816 0 1 Fry, Mr. Richard male NaN 0 0 112058 0.0000 B102 S

891 rows × 12 columns

x = pd.DataFrame(np.random.randn(4,4),
                      index=list('3241'),
                     columns=list('BDCA'))
x
B D C A
3 0.207853 0.574396 0.086197 1.187164
2 -0.794598 -0.308106 -0.291090 -0.150375
4 0.215895 -0.189428 0.556125 -0.361963
1 -1.593097 -0.205176 1.427471 -0.339048
y = pd.DataFrame(np.random.randn(5,4),
                      index=list('53241'),
                     columns=list('BDCE'))
y
B D C E
5 0.345871 -1.007209 -0.047450 -0.867648
3 -1.674230 0.874186 1.965207 -0.291205
2 1.802405 -0.033730 1.164470 -1.460408
4 0.501799 0.024054 -2.140898 -0.611685
1 -1.486755 1.780159 -2.015993 0.259121
x+y#按行索引與列名對齊後相加;只有兩邊都存在的位置才有結果,其餘位置為NaN
A B C D E
1 NaN -3.079853 -0.588522 1.574983 NaN
2 NaN 1.007806 0.873381 -0.341837 NaN
3 NaN -1.466377 2.051404 1.448582 NaN
4 NaN 0.717694 -1.584773 -0.165374 NaN
5 NaN NaN NaN NaN NaN
max(df['堂兄弟/妹個數']+df['父母與小孩個數'])#取最大值
10
df.describe()
乘客ID 是否倖存 乘客等級(1/2/3等艙位) 年齡 堂兄弟/妹個數 父母與小孩個數 票價
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
from matplotlib import pyplot as plt
plt.hist(df['年齡'])#畫年齡的直方圖
(array([ 54.,  46., 177., 169., 118.,  70.,  45.,  24.,   9.,   2.]),
 array([ 0.42 ,  8.378, 16.336, 24.294, 32.252, 40.21 , 48.168, 56.126,
        64.084, 72.042, 80.   ]),
 <BarContainer object of 10 artists>)

img

df['票價'].describe()#票價基本資訊
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: 票價, dtype: float64
plt.hist(df['票價'])#票價的直方圖
(array([732., 106.,  31.,   2.,  11.,   6.,   0.,   0.,   0.,   3.]),
 array([  0.     ,  51.23292, 102.46584, 153.69876, 204.93168, 256.1646 ,
        307.39752, 358.63044, 409.86336, 461.09628, 512.3292 ]),
 <BarContainer object of 10 artists>)

img

df['父母與小孩個數'].describe()#父母與小孩個數基本資訊
count    891.000000
mean       0.381594
std        0.806057
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: 父母與小孩個數, dtype: float64
plt.hist(df['父母與小孩個數'])#父母與小孩個數的直方圖
(array([678., 118.,   0.,  80.,   0.,   5.,   4.,   0.,   5.,   1.]),
 array([0. , 0.6, 1.2, 1.8, 2.4, 3. , 3.6, 4.2, 4.8, 5.4, 6. ]),
 <BarContainer object of 10 artists>)

img

相關文章