1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
| import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns
# Load the Kaggle Titanic training and test sets.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

# Stack train and test into a single frame so that the feature engineering
# below is applied to both. Test rows get Survived = NaN, which is the marker
# used later to split the frame back into train and test.
test_data['Survived'] = np.nan
frames = [train_data, test_data]
df = pd.concat(frames, ignore_index=True, sort=False)
# ------------ Age ------------
# Impute missing Age values with a random forest regressor trained on
# Pclass, Sex, Parch and SibSp.
from sklearn.ensemble import RandomForestRegressor

# Select the target and its predictors, then one-hot encode the label
# (string) columns. Age remains the first column of the encoded frame.
age_df = pd.get_dummies(df[['Age', 'Pclass', 'Sex', 'Parch', 'SibSp']])

# Rows with a known Age form the training set; rows without one are the
# rows to predict. Both are converted to numpy arrays.
age_is_missing = age_df['Age'].isnull()
known_age = age_df[~age_is_missing].values
unknown_age = age_df[age_is_missing].values

# Column 0 is the target (Age); every remaining column is a predictor.
y = known_age[:, 0]
X = known_age[:, 1:]

# Fit the regressor with a fixed seed.
rfr = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)
rfr.fit(X, y)

# Predict Age for the rows where it is missing and write the values back
# into the main frame.
predictedAges = rfr.predict(unknown_age[:, 1:])
df.loc[df['Age'].isnull(), 'Age'] = predictedAges
# ------------ Name --------------
# Extract the honorific (Title) from Name and merge rare titles into groups.
# The title is the text between ", " and the following ". " in Name.
df['Title'] = df['Name'].map(lambda name: name.split(', ')[1].split('. ')[0])

# Raw title -> grouped title. Titles that are not listed are kept unchanged.
title_groups = {
    'Capt': 'Officer',
    'Col': 'Officer',
    'Major': 'Officer',
    'Dr': 'Officer',
    'Rev': 'Officer',
    'Don': 'Royalty',
    'Sir': 'Royalty',
    'the Countess': 'Royalty',
    'Lady': 'Royalty',
    'Dona': 'Royalty',
    'Mme': 'Mrs',
    'Ms': 'Mrs',
    'Mlle': 'Miss',
    'Jonkheer': 'Master',
}

# Assign the result back instead of calling replace(..., inplace=True) on
# df['Title']: an in-place call on a column selection is chained assignment,
# which recent pandas warns about and which does not modify df when
# Copy-on-Write is enabled.
df['Title'] = df['Title'].replace(title_groups)
# ------------ Surname ------------
# The surname is the part of Name before the first comma.
df['Surname'] = df['Name'].map(lambda name: name.partition(',')[0].strip())

# Number of passengers sharing each surname (2 or more is treated as a family).
surname_counts = df['Surname'].value_counts()
df['FamilyGroup'] = df['Surname'].map(surname_counts)

in_family = df['FamilyGroup'] >= 2
is_female = df['Sex'] == 'female'
is_male = df['Sex'] == 'male'

# Mean survival per surname for family members who are female or aged 16
# or under.
Female_Child_Group = df.loc[in_family & ((df['Age'] <= 16) | is_female)]
Female_Child_Group = Female_Child_Group.groupby('Surname')['Survived'].mean()

# Mean survival per surname for family members who are male and over 16.
Male_Adult_Group = df.loc[in_family & (df['Age'] > 16) & is_male]
Male_Adult_List = Male_Adult_Group.groupby('Surname')['Survived'].mean()

# Dead list: surnames whose female/child members have a mean survival of 0.
# Survived list: surnames whose adult male members have a mean survival of 1.
Dead_list = set(Female_Child_Group[Female_Child_Group == 0].index)
Survived_list = set(Male_Adult_List[Male_Adult_List == 1].index)

# Push the two lists into the features of the test rows (Survived is NaN):
# every test row with a surname on the dead list is overwritten with
# male / 28.0 / Mr, and every test row with a surname on the survived list
# with female / 5.0 / Mrs. The survived-list override runs second, so it
# wins for a surname that is on both lists.
is_test_row = df['Survived'].isnull()
override_columns = ['Sex', 'Age', 'Title']
df.loc[is_test_row & df['Surname'].isin(Dead_list), override_columns] = ['male', 28.0, 'Mr']
df.loc[is_test_row & df['Surname'].isin(Survived_list), override_columns] = ['female', 5.0, 'Mrs']
# ----------- Fare -------------
# Fill missing fares with the median fare of the passengers that have
# Embarked == 'S' and Pclass == 3.
reference_rows = (df['Pclass'] == 3) & (df['Embarked'] == 'S')
fare = df.loc[reference_rows, 'Fare'].median()
df['Fare'] = df['Fare'].fillna(value=fare)
# ----------- Family -------------
# Family size = SibSp + Parch + 1, then bucketed into three labels.
df['Family'] = df['SibSp'] + df['Parch'] + 1

family_size = df['Family']
family_conditions = [
    (family_size >= 2) & (family_size <= 4),                         # label 2
    ((family_size >= 5) & (family_size <= 7)) | (family_size == 1),  # label 1
    family_size >= 8,                                                # label 0
]
# Rows matching none of the conditions stay NaN.
df['Family_label'] = np.select(family_conditions, [2, 1, 0], default=np.nan)
# ----------- Ticket ----------------
# Count how many passengers share each ticket number.
Ticket_Count = dict(df['Ticket'].value_counts())
df['TicketGroup'] = df['Ticket'].map(Ticket_Count)

# Bucket the ticket-group size into three labels:
#   2-4 passengers      -> 2
#   1 or 5-8 passengers -> 1
#   9 or more           -> 0
# The lowest bucket starts at 9 rather than 11 so that group sizes 9 and 10
# also get a label; with the old threshold they were left as NaN, which
# would then reach the model input.
ticket_group = df['TicketGroup']
df.loc[ticket_group.between(2, 4), 'Ticket_label'] = 2
df.loc[ticket_group.between(5, 8) | (ticket_group == 1), 'Ticket_label'] = 1
df.loc[ticket_group >= 9, 'Ticket_label'] = 0
# ------------- Cabin ----------------
# Use the first character of Cabin as a feature. Missing cabins are filled
# with 'Unknown' first, so their label is 'U'.
cabin_filled = df['Cabin'].fillna('Unknown')
df['Cabin'] = cabin_filled
df['Cabin_label'] = cabin_filled.str[0]
# ---------- Embarked ---------------
# Fill missing Embarked values with 'S'.
embarked_is_missing = df['Embarked'].isnull()
df.loc[embarked_is_missing, 'Embarked'] = 'S'
# ------------- Preprocessing ---------------
# Keep only the target and the columns used as model inputs.
model_columns = [
    'Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
    'Title', 'Family_label', 'Cabin_label', 'Ticket_label',
]
df = df[model_columns]

# One-hot encode the label (string) columns. dtype=float is requested
# explicitly: recent pandas returns bool dummy columns by default, and a
# frame mixing float and bool columns converts to an object-dtype numpy
# array, which scikit-learn classifiers reject as a target.
df = pd.get_dummies(df, dtype=float)

# Split back into train (Survived known) and test (Survived is NaN).
is_train_row = df['Survived'].notnull()
train = df[is_train_row]
test = df[~is_train_row].drop('Survived', axis=1)

# Convert to float numpy arrays. The target is selected by name instead of
# by column position so the split does not depend on column order.
X = train.drop('Survived', axis=1).to_numpy(dtype=float)
y = train['Survived'].to_numpy(dtype=float)
test_x = test.to_numpy(dtype=float)
# ----------- Model construction ---------------
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

# Narrow the features down from 25 to the best 20.
select = SelectKBest(k=20)

# Random forest with a fixed seed.
# NOTE(review): warm_start=True is kept as in the original; fit() is called
# only once in the visible script.
clf = RandomForestClassifier(
    n_estimators=26,
    max_depth=6,
    max_features='sqrt',
    random_state=10,
    warm_start=True,
)

# Feature selection followed by the classifier, fitted on the training data.
pipeline = make_pipeline(select, clf)
pipeline.fit(X, y)
# ----- Build the submission file -------
# Predict the test rows and write PassengerId / Survived (as int32) to CSV.
PassengerId = test_data['PassengerId']
predictions = pipeline.predict(test_x)
submission_columns = {
    "PassengerId": PassengerId,
    "Survived": predictions.astype(np.int32),
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("result0322.csv", index=False)
|