[{"data":1,"prerenderedAt":1773},["ShallowReactive",2],{"content-\u002Fcontents\u002Fml_titanic_part4":3,"surroundPost-\u002Fcontents\u002Fml_titanic_part4":1764},{"id":4,"title":5,"body":6,"createdAt":1753,"description":147,"draft":1754,"extension":1755,"meta":1756,"navigation":169,"path":1757,"seo":1758,"stem":1759,"tags":1760,"thumbnail":1762,"updatedAt":1753,"__hash__":1763},"contents\u002Fcontents\u002Fml_titanic_part4.md","【機械学習】初心者がKaggleのtitanicで勉強してみた(モデル評価編)",{"type":7,"value":8,"toc":1742},"minimark",[9,13,17,20,43,46,49,61,68,75,78,81,83,86,89,98,101,108,115,118,129,141,218,223,233,242,245,251,254,257,271,325,328,331,338,362,365,371,374,400,403,406,409,417,553,556,559,566,577,587,590,663,666,669,672,682,745,748,751,754,763,766,769,812,815,818,821,903,906,909,912,915,1099,1102,1105,1108,1111,1216,1219,1222,1376,1379,1382,1385,1388,1391,1402,1405,1540,1543,1546,1549,1569,1572,1575,1578,1581,1584,1587,1591,1594,1605,1653,1656,1659,1662,1670,1674,1677,1680,1683,1686,1689,1692,1695,1698,1701,1704,1707,1710,1713,1716,1719,1722,1738],[10,11,12],"h2",{"id":12},"はじめに",[14,15,16],"p",{},"初心者が Kaggle の Titanic をやってみた第 4 回目モデル評価編やります！",[14,18,19],{},"（↓ これまでの記事）",[21,22,23,31,37],"ul",{},[24,25,26],"li",{},[27,28,30],"a",{"href":29},"ml_titanic","【機械学習】初心者が Kaggle の titanic で勉強してみた",[24,32,33],{},[27,34,36],{"href":35},"ml_titanic_part2","【機械学習】初心者が Kaggle の titanic で勉強してみた(前処理編)",[24,38,39],{},[27,40,42],{"href":41},"ml_titanic_part3","【機械学習】初心者が Kaggle の titanic で勉強してみた(アルゴリズム選定編)",[14,44,45],{},"このシリーズはこれでラストです！！",[14,47,48],{},"よーーーーやく終わります。",[14,50,51,52,56,57,60],{},"前回は、いくつかのアルゴリズムを検証データで評価し、",[53,54,55],"strong",{},"ロジスティック回帰","と",[53,58,59],{},"ランダムフォレスト","にアルゴリズムを絞りました。",[14,62,63,64,67],{},"さらに、この二つのアルゴリズムに対して",[53,65,66],{},"ハイパーパラメータチューニング","を行い、性能の向上を試みました。\nしかし、結果はそんなによくなかった。。。",[14,69,70,71,74],{},"そこで、今回は",[53,72,73],{},"特徴量エンジニアリング","を行なって、モデルの精度を向上させたいと思います！",[14,76,77],{},"やっていくなかで、これまでのコードがいろいろよくないアプローチをしていたことに気づいたので、それらはいったん忘れてください。。",[14,79,80],{},"これを読み終わったあとに前のやつを見返すと無駄なことをやってるいことに気づくと思います。",[10,82,73],{"id":73},[14,84,85],{},"特徴量エンジニアリングは、モデルの精度向上のために、データから特徴量を選択・加工・生成する作業だと自分は考えています。",[14,87,88],{},"参考書籍の「Python ではじめる機械学習」では特徴量エンジニアリングは",[90,91,92,95],"blockquote",{},[14,93,94],{},"「特定のアプリケーションに対して、最良のデータ表現を模索すること」",[14,96,97],{},"出典：Andreas C. Muller、Sarah Guido 著、中田 秀基 訳「Python ではじめる機械学習」",[14,99,100],{},"とあります。",[14,102,103,104,107],{},"私も今回いろいろ試してみてこの",[53,105,106],{},"最良のデータ表現の模索","というのがぴったりだなと思っています。",[14,109,110,111,114],{},"それでは、どのようにこの",[53,112,113],{},"よいデータ表現","を見つければいいのか。",[14,116,117],{},"私が現在考えてるアプローチは以下のものです。",[21,119,120,123,126],{},[24,121,122],{},"数値データをビニングしてカテゴライズ",[24,124,125],{},"数値データの正規化 or 標準化",[24,127,128],{},"新しい特徴量の作成（問題に基づいて作成）",[14,130,131,132,136,137,140],{},"今回の titanic のような分類問題の場合、まず",[133,134,135],"code",{},"1","のアプローチをやってみる。そのために、数値データである",[133,138,139],{},"Fare","のヒストグラムをもう一度確認してみる。",[142,143,148],"pre",{"className":144,"code":145,"language":146,"meta":147,"style":147},"language-py shiki shiki-themes github-dark","import pandas as pd\nimport matplotlib.pyplot as plt\n\nTRAIN_DATA_FILE = \"train.csv\"\ntrain_data = pd.read_csv(TRAIN_DATA_FILE)\ny = train_data[\"Survived\"]\n\ntrain_data[\"Fare\"].hist(bins=50, figsize=(10,5))\nplt.title(\"Fare histgram\")\nplt.xlabel(\"Fare\")\nplt.show()\n","py","",[133,149,150,158,164,171,177,183,189,194,200,206,212],{"__ignoreMap":147},[151,152,155],"span",{"class":153,"line":154},"line",1,[151,156,157],{},"import pandas as pd\n",[151,159,161],{"class":153,"line":160},2,[151,162,163],{},"import matplotlib.pyplot as plt\n",[151,165,167],{"class":153,"line":166},3,[151,168,170],{"emptyLinePlaceholder":169},true,"\n",[151,172,174],{"class":153,"line":173},4,[151,175,176],{},"TRAIN_DATA_FILE = \"train.csv\"\n",[151,178,180],{"class":153,"line":179},5,[151,181,182],{},"train_data = pd.read_csv(TRAIN_DATA_FILE)\n",[151,184,186],{"class":153,"line":185},6,[151,187,188],{},"y = train_data[\"Survived\"]\n",[151,190,192],{"class":153,"line":191},7,[151,193,170],{"emptyLinePlaceholder":169},[151,195,197],{"class":153,"line":196},8,[151,198,199],{},"train_data[\"Fare\"].hist(bins=50, figsize=(10,5))\n",[151,201,203],{"class":153,"line":202},9,[151,204,205],{},"plt.title(\"Fare histgram\")\n",[151,207,209],{"class":153,"line":208},10,[151,210,211],{},"plt.xlabel(\"Fare\")\n",[151,213,215],{"class":153,"line":214},11,[151,216,217],{},"plt.show()\n",[219,220],"img",{"alt":221,"img-src":222},"output","\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_25.png",[14,224,225,226,229,230,232],{},"0〜500 までの広い値をとることがわかる。\nさらに以下の",[133,227,228],{},"Survived={0, 1}","が",[133,231,139],{},"の値とどのように対応しているのかカウントして平均をとった情報をみてみる。",[142,234,236],{"className":144,"code":235,"language":146,"meta":147,"style":147},"print (train_data[[\"Fare\", \"Survived\"]].groupby(['Fare'], as_index=False).mean().head(10))\n",[133,237,238],{"__ignoreMap":147},[151,239,240],{"class":153,"line":154},[151,241,235],{},[219,243],{"alt":221,"img-src":244},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_26.png",[14,246,247,248,250],{},"ほとんどの",[133,249,139],{},"の値で十分なデータがないことがわかる。",[14,252,253],{},"というより、こんなに細かく値を区切って生存したかどうかを判断するかはおかしい。",[14,255,256],{},"おそらく、料金と生存者の相関はあるだろう。しかし、ここまで細かい値の違いで傾向は抽出できないはず。",[14,258,259,260,270],{},"そこで、ある程度値をまとめあげてカテゴライズすることを考える。\n",[261,262,263,264],"del",{},"（参考：",[27,265,269],{"href":266,"rel":267},"https:\u002F\u002Fwww.kaggle.com\u002Fsinakhorami\u002Ftitanic-best-working-classifier%EF%BC%89",[268],"nofollow","https:\u002F\u002Fwww.kaggle.com\u002Fsinakhorami\u002Ftitanic-best-working-classifier）","\nリンクが切れてました（2019\u002F08\u002F22 　追記）",[142,272,274],{"className":144,"code":273,"language":146,"meta":147,"style":147},"def FE_fare(data):\n    # qcutで各ビンに含まれる個数が等しくなるようにビン分割する\n    data['CategoricalFare'] = pd.qcut(data['Fare'], 4)\n    data.loc[data['Fare'] \u003C= 7.91, 'Fare']                         = 0\n    data.loc[(data['Fare'] > 7.91) & (data['Fare'] \u003C= 14.454), 'Fare'] = 1\n    data.loc[(data['Fare'] > 14.454) & (data['Fare'] \u003C= 31), 'Fare']   = 2\n    data.loc[ data['Fare'] > 31, 'Fare']        =3\n\nFE_fare(train_data)\nprint (train_data[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean().head())\n",[133,275,276,281,286,291,296,301,306,311,315,320],{"__ignoreMap":147},[151,277,278],{"class":153,"line":154},[151,279,280],{},"def FE_fare(data):\n",[151,282,283],{"class":153,"line":160},[151,284,285],{},"    # qcutで各ビンに含まれる個数が等しくなるようにビン分割する\n",[151,287,288],{"class":153,"line":166},[151,289,290],{},"    data['CategoricalFare'] = pd.qcut(data['Fare'], 4)\n",[151,292,293],{"class":153,"line":173},[151,294,295],{},"    data.loc[data['Fare'] \u003C= 7.91, 'Fare']                         = 0\n",[151,297,298],{"class":153,"line":179},[151,299,300],{},"    data.loc[(data['Fare'] > 7.91) & (data['Fare'] \u003C= 14.454), 'Fare'] = 1\n",[151,302,303],{"class":153,"line":185},[151,304,305],{},"    data.loc[(data['Fare'] > 14.454) & (data['Fare'] \u003C= 31), 'Fare']   = 2\n",[151,307,308],{"class":153,"line":191},[151,309,310],{},"    data.loc[ data['Fare'] > 31, 'Fare']        =3\n",[151,312,313],{"class":153,"line":196},[151,314,170],{"emptyLinePlaceholder":169},[151,316,317],{"class":153,"line":202},[151,318,319],{},"FE_fare(train_data)\n",[151,321,322],{"class":153,"line":208},[151,323,324],{},"print (train_data[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean().head())\n",[219,326],{"alt":221,"img-src":327},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_27.png",[14,329,330],{},"明らかに料金が高い方が生存の割合が高いのがわかる。",[14,332,333,334,337],{},"同様に",[133,335,336],{},"Age","のデータも処理していく。\nまず、ヒストグラムは以下の通り。",[142,339,341],{"className":144,"code":340,"language":146,"meta":147,"style":147},"train_data[\"Age\"].hist(bins=50, figsize=(10,5))\nplt.title(\"Age histgram\")\nplt.xlabel(\"Age\")\nplt.show()\n",[133,342,343,348,353,358],{"__ignoreMap":147},[151,344,345],{"class":153,"line":154},[151,346,347],{},"train_data[\"Age\"].hist(bins=50, figsize=(10,5))\n",[151,349,350],{"class":153,"line":160},[151,351,352],{},"plt.title(\"Age histgram\")\n",[151,354,355],{"class":153,"line":166},[151,356,357],{},"plt.xlabel(\"Age\")\n",[151,359,360],{"class":153,"line":173},[151,361,217],{},[219,363],{"alt":221,"img-src":364},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_23.png",[14,366,367,368,370],{},"0〜80 歳と広い値をとりうることがわかる。また、前処理編でみたように、",[133,369,336],{},"データは欠損値があるのでそれを補填する必要がある。",[14,372,373],{},"ただし、以前の中央値で欠損をうめるのはよくない。\n補填後の以下の分布をみればわかる。",[142,375,377],{"className":144,"code":376,"language":146,"meta":147,"style":147},"train_data[\"Age\"] = train_data[\"Age\"].fillna(train_data[\"Age\"].median())\ntrain_data[\"Age\"].hist(bins=50, figsize=(10,5))\nplt.title(\"Age histgram\")\nplt.xlabel(\"Age\")\nplt.show()\n",[133,378,379,384,388,392,396],{"__ignoreMap":147},[151,380,381],{"class":153,"line":154},[151,382,383],{},"train_data[\"Age\"] = train_data[\"Age\"].fillna(train_data[\"Age\"].median())\n",[151,385,386],{"class":153,"line":160},[151,387,347],{},[151,389,390],{"class":153,"line":166},[151,391,352],{},[151,393,394],{"class":153,"line":173},[151,395,357],{},[151,397,398],{"class":153,"line":179},[151,399,217],{},[219,401],{"alt":221,"img-src":402},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_24.png",[14,404,405],{},"見てのとおり補填した中央値だけ突き抜けてしまう。",[14,407,408],{},"なので、データの分布から値を生成して欠損値をうめる。このアプローチも上と同様に以下のコードを参考にさせていただきました。",[14,410,411,416],{},[261,412,263,413],{},[27,414,269],{"href":266,"rel":415},[268],"\nリンク切れでした（2019\u002F08\u002F22 追記）",[142,418,420],{"className":144,"code":419,"language":146,"meta":147,"style":147},"def FE_age(data):\n    age_avg        = data['Age'].mean()\n    age_std        = data['Age'].std()\n    age_null_count = data['Age'].isnull().sum()\n\n    ## 平均値と平均標準偏差の間の値からからランダムに欠損分だけ選択\n    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)\n    data['Age'][np.isnan(data['Age'])] = age_null_random_list\n    data['Age'] = data['Age'].astype(int)\n\n    data['CategoricalAge'] = pd.cut(data['Age'], 5)\n\n    # Mapping Age\n    data.loc[data['Age'] \u003C= 16, 'Age']                         = 0\n    data.loc[(data['Age'] > 16) & (data['Age'] \u003C= 32), 'Age'] = 1\n    data.loc[(data['Age'] > 32) & (data['Age'] \u003C= 48), 'Age'] = 2\n    data.loc[(data['Age'] > 48) & (data['Age'] \u003C= 64), 'Age'] = 3\n    data.loc[ data['Age'] > 64, 'Age']  = 4\n\nFE_age(train_data)\ntrain_data[\"Age\"].hist(bins=50, figsize=(10,5))\nplt.title(\"Age histgram\")\nplt.xlabel(\"Age\")\nplt.show()\nprint (train_data[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean().head())\n",[133,421,422,427,432,437,442,446,451,456,461,466,470,475,480,486,492,498,504,510,516,521,527,532,537,542,547],{"__ignoreMap":147},[151,423,424],{"class":153,"line":154},[151,425,426],{},"def FE_age(data):\n",[151,428,429],{"class":153,"line":160},[151,430,431],{},"    age_avg        = data['Age'].mean()\n",[151,433,434],{"class":153,"line":166},[151,435,436],{},"    age_std        = data['Age'].std()\n",[151,438,439],{"class":153,"line":173},[151,440,441],{},"    age_null_count = data['Age'].isnull().sum()\n",[151,443,444],{"class":153,"line":179},[151,445,170],{"emptyLinePlaceholder":169},[151,447,448],{"class":153,"line":185},[151,449,450],{},"    ## 平均値と平均標準偏差の間の値からからランダムに欠損分だけ選択\n",[151,452,453],{"class":153,"line":191},[151,454,455],{},"    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)\n",[151,457,458],{"class":153,"line":196},[151,459,460],{},"    data['Age'][np.isnan(data['Age'])] = age_null_random_list\n",[151,462,463],{"class":153,"line":202},[151,464,465],{},"    data['Age'] = data['Age'].astype(int)\n",[151,467,468],{"class":153,"line":208},[151,469,170],{"emptyLinePlaceholder":169},[151,471,472],{"class":153,"line":214},[151,473,474],{},"    data['CategoricalAge'] = pd.cut(data['Age'], 5)\n",[151,476,478],{"class":153,"line":477},12,[151,479,170],{"emptyLinePlaceholder":169},[151,481,483],{"class":153,"line":482},13,[151,484,485],{},"    # Mapping Age\n",[151,487,489],{"class":153,"line":488},14,[151,490,491],{},"    data.loc[data['Age'] \u003C= 16, 'Age']                         = 0\n",[151,493,495],{"class":153,"line":494},15,[151,496,497],{},"    data.loc[(data['Age'] > 16) & (data['Age'] \u003C= 32), 'Age'] = 1\n",[151,499,501],{"class":153,"line":500},16,[151,502,503],{},"    data.loc[(data['Age'] > 32) & (data['Age'] \u003C= 48), 'Age'] = 2\n",[151,505,507],{"class":153,"line":506},17,[151,508,509],{},"    data.loc[(data['Age'] > 48) & (data['Age'] \u003C= 64), 'Age'] = 3\n",[151,511,513],{"class":153,"line":512},18,[151,514,515],{},"    data.loc[ data['Age'] > 64, 'Age']  = 4\n",[151,517,519],{"class":153,"line":518},19,[151,520,170],{"emptyLinePlaceholder":169},[151,522,524],{"class":153,"line":523},20,[151,525,526],{},"FE_age(train_data)\n",[151,528,530],{"class":153,"line":529},21,[151,531,347],{},[151,533,535],{"class":153,"line":534},22,[151,536,352],{},[151,538,540],{"class":153,"line":539},23,[151,541,357],{},[151,543,545],{"class":153,"line":544},24,[151,546,217],{},[151,548,550],{"class":153,"line":549},25,[151,551,552],{},"print (train_data[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean().head())\n",[219,554],{"alt":221,"img-src":555},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_28.png",[14,557,558],{},"おー明らかに子供の生存率が高くて老人の生存率が低いことがわかる。\n特徴とよべるものがよくでてる。",[14,560,561,562,565],{},"数値データに関する特徴量エンジニアリングはこれで終わり。\n最初よりデータが",[53,563,564],{},"よい表現","になった気がしてる。",[14,567,568,569,572,573,576],{},"続いて、",[133,570,571],{},"3","のアプローチをやっていく。\n今回は",[133,574,575],{},"2","のアプローチはやりません（おそらく必要ない）",[14,578,579,580,56,583,586],{},"新しく特徴量を作るということで、",[133,581,582],{},"SibSp",[133,584,585],{},"Parch","から家族人数という変数を作る。\nこの特徴量を作る理由は、いろいろなサイトで作成しているのを見たから。",[14,588,589],{},"というのもあるが、生存者予測という課題なら同乗している家族人数が影響しそうなファクターであることは想像できると思う。\n（こんな感じで特徴量にあたりをつけて作れるのが一流のデータサイエンティストなのかな）",[142,591,593],{"className":144,"code":592,"language":146,"meta":147,"style":147},"def FE_family_size(data):\n    # 家族サイズ\n    family_size = []\n    for a_tmp, b_tmp in zip(data[\"SibSp\"], data['Parch']):\n        num_family = a_tmp + b_tmp + 1\n        if num_family > 4:\n            num_family = 5\n            family_size.append(num_family)\n        else:\n            family_size.append(num_family)\n    data[\"family_size\"]  = family_size\n\nFE_family_size(train_data)\nprint (train_data[[\"family_size\", \"Survived\"]].groupby([\"family_size\"], as_index=False).mean().head(10))\n",[133,594,595,600,605,610,615,620,625,630,635,640,644,649,653,658],{"__ignoreMap":147},[151,596,597],{"class":153,"line":154},[151,598,599],{},"def FE_family_size(data):\n",[151,601,602],{"class":153,"line":160},[151,603,604],{},"    # 家族サイズ\n",[151,606,607],{"class":153,"line":166},[151,608,609],{},"    family_size = []\n",[151,611,612],{"class":153,"line":173},[151,613,614],{},"    for a_tmp, b_tmp in zip(data[\"SibSp\"], data['Parch']):\n",[151,616,617],{"class":153,"line":179},[151,618,619],{},"        num_family = a_tmp + b_tmp + 1\n",[151,621,622],{"class":153,"line":185},[151,623,624],{},"        if num_family > 4:\n",[151,626,627],{"class":153,"line":191},[151,628,629],{},"            num_family = 5\n",[151,631,632],{"class":153,"line":196},[151,633,634],{},"            family_size.append(num_family)\n",[151,636,637],{"class":153,"line":202},[151,638,639],{},"        else:\n",[151,641,642],{"class":153,"line":208},[151,643,634],{},[151,645,646],{"class":153,"line":214},[151,647,648],{},"    data[\"family_size\"]  = family_size\n",[151,650,651],{"class":153,"line":477},[151,652,170],{"emptyLinePlaceholder":169},[151,654,655],{"class":153,"line":482},[151,656,657],{},"FE_family_size(train_data)\n",[151,659,660],{"class":153,"line":488},[151,661,662],{},"print (train_data[[\"family_size\", \"Survived\"]].groupby([\"family_size\"], as_index=False).mean().head(10))\n",[219,664],{"alt":221,"img-src":665},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_29.png",[14,667,668],{},"これもいい特徴になっている気する。",[14,670,671],{},"家族人数が多い方が生存者割合が低いことがわかる（5 人以上のサンプルが少ない可能性がありそう）",[14,673,674,675,56,678,681],{},"生成は以上。あとは",[133,676,677],{},"Embarked",[133,679,680],{},"Sex","のカテゴリを整数値にする（今回はカテゴリ変数にしない）。",[142,683,685],{"className":144,"code":684,"language":146,"meta":147,"style":147},"def FE_Embarked(data):\n    data = data.fillna({'Embarked': 'S'}, inplace = True)\n    data['Embarked'] = data['Embarked'].map( {'S': 1, 'C': 2, 'Q': 3} ).astype(int)\n\nFE_Embarked(train_data)\nprint (train_data[[\"Embarked\", \"Survived\"]].groupby([\"Embarked\"], as_index=False).mean().head(10))\n\ndef FE_sex_onehot(data):\n    ## 性別を1カラムに\n    data.loc[data[\"Sex\"] == \"female\", 'Sex'] = 0\n    data.loc[data[\"Sex\"] == \"male\", 'Sex'] = 1\nFE_sex_onehot(train_data)\n",[133,686,687,692,697,702,706,711,716,720,725,730,735,740],{"__ignoreMap":147},[151,688,689],{"class":153,"line":154},[151,690,691],{},"def FE_Embarked(data):\n",[151,693,694],{"class":153,"line":160},[151,695,696],{},"    data = data.fillna({'Embarked': 'S'}, inplace = True)\n",[151,698,699],{"class":153,"line":166},[151,700,701],{},"    data['Embarked'] = data['Embarked'].map( {'S': 1, 'C': 2, 'Q': 3} ).astype(int)\n",[151,703,704],{"class":153,"line":173},[151,705,170],{"emptyLinePlaceholder":169},[151,707,708],{"class":153,"line":179},[151,709,710],{},"FE_Embarked(train_data)\n",[151,712,713],{"class":153,"line":185},[151,714,715],{},"print (train_data[[\"Embarked\", \"Survived\"]].groupby([\"Embarked\"], as_index=False).mean().head(10))\n",[151,717,718],{"class":153,"line":191},[151,719,170],{"emptyLinePlaceholder":169},[151,721,722],{"class":153,"line":196},[151,723,724],{},"def FE_sex_onehot(data):\n",[151,726,727],{"class":153,"line":202},[151,728,729],{},"    ## 性別を1カラムに\n",[151,731,732],{"class":153,"line":208},[151,733,734],{},"    data.loc[data[\"Sex\"] == \"female\", 'Sex'] = 0\n",[151,736,737],{"class":153,"line":214},[151,738,739],{},"    data.loc[data[\"Sex\"] == \"male\", 'Sex'] = 1\n",[151,741,742],{"class":153,"line":477},[151,743,744],{},"FE_sex_onehot(train_data)\n",[219,746],{"alt":221,"img-src":747},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_30.png",[219,749],{"alt":221,"img-src":750},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_31.png",[14,752,753],{},"ここまでやると次のような特徴量が出来上がっているはず",[142,755,757],{"className":144,"code":756,"language":146,"meta":147,"style":147},"train_data.info()\n",[133,758,759],{"__ignoreMap":147},[151,760,761],{"class":153,"line":154},[151,762,756],{},[219,764],{"alt":221,"img-src":765},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_32.png",[14,767,768],{},"次はいらない列をおとす。",[142,770,772],{"className":144,"code":771,"language":146,"meta":147,"style":147},"train_data = train_data.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)\ndrop_train_list = ['PassengerId', \"Name\", 'Ticket', 'Cabin', 'Survived']\n\ndef drop_columns(drop_list, data):\n    data = data.drop(drop_list, axis=1)\n    return data\n\ntrain_data = drop_columns(drop_train_list, train_data)\n",[133,773,774,779,784,788,793,798,803,807],{"__ignoreMap":147},[151,775,776],{"class":153,"line":154},[151,777,778],{},"train_data = train_data.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)\n",[151,780,781],{"class":153,"line":160},[151,782,783],{},"drop_train_list = ['PassengerId', \"Name\", 'Ticket', 'Cabin', 'Survived']\n",[151,785,786],{"class":153,"line":166},[151,787,170],{"emptyLinePlaceholder":169},[151,789,790],{"class":153,"line":173},[151,791,792],{},"def drop_columns(drop_list, data):\n",[151,794,795],{"class":153,"line":179},[151,796,797],{},"    data = data.drop(drop_list, axis=1)\n",[151,799,800],{"class":153,"line":185},[151,801,802],{},"    return data\n",[151,804,805],{"class":153,"line":191},[151,806,170],{"emptyLinePlaceholder":169},[151,808,809],{"class":153,"line":196},[151,810,811],{},"train_data = drop_columns(drop_train_list, train_data)\n",[14,813,814],{},"これで特徴量エンジニアリングをふまえた前処理終了。",[10,816,817],{"id":817},"ランダムフォレストを用いた特徴量の重要度可視化",[14,819,820],{},"ランダムフォレストなどの決定木を用いたアルゴリズムはどの特徴量が最終的な決定に影響しているのか可視化することができる。\nこれで、重要そうな特徴量がわかる（あとで説明もしやすい）。",[142,822,824],{"className":144,"code":823,"language":146,"meta":147,"style":147},"from sklearn import model_selection, metrics\n\nX_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, y, random_state=1)\n\n# ランダムフォレスト重要度\ndef measure_importance_rdf(X_train, y_train, X_test, y_test):\n    rf = RandomForestClassifier()\n    rf.fit(X_train, y_train)\n    features = pd.DataFrame()\n    features['feature'] = X_train.columns\n    features['importance'] = rf.feature_importances_\n    features.sort_values(by=['importance'], ascending=True, inplace=True)\n    features.set_index('feature', inplace=True)\n    features.plot(kind='barh', figsize=(10, 5))\n\nmeasure_importance_rdf(X_train, y_train, X_test, y_test)\n",[133,825,826,831,835,840,844,849,854,859,864,869,874,879,884,889,894,898],{"__ignoreMap":147},[151,827,828],{"class":153,"line":154},[151,829,830],{},"from sklearn import model_selection, metrics\n",[151,832,833],{"class":153,"line":160},[151,834,170],{"emptyLinePlaceholder":169},[151,836,837],{"class":153,"line":166},[151,838,839],{},"X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, y, random_state=1)\n",[151,841,842],{"class":153,"line":173},[151,843,170],{"emptyLinePlaceholder":169},[151,845,846],{"class":153,"line":179},[151,847,848],{},"# ランダムフォレスト重要度\n",[151,850,851],{"class":153,"line":185},[151,852,853],{},"def measure_importance_rdf(X_train, y_train, X_test, y_test):\n",[151,855,856],{"class":153,"line":191},[151,857,858],{},"    rf = RandomForestClassifier()\n",[151,860,861],{"class":153,"line":196},[151,862,863],{},"    rf.fit(X_train, y_train)\n",[151,865,866],{"class":153,"line":202},[151,867,868],{},"    features = pd.DataFrame()\n",[151,870,871],{"class":153,"line":208},[151,872,873],{},"    features['feature'] = X_train.columns\n",[151,875,876],{"class":153,"line":214},[151,877,878],{},"    features['importance'] = rf.feature_importances_\n",[151,880,881],{"class":153,"line":477},[151,882,883],{},"    features.sort_values(by=['importance'], ascending=True, inplace=True)\n",[151,885,886],{"class":153,"line":482},[151,887,888],{},"    features.set_index('feature', inplace=True)\n",[151,890,891],{"class":153,"line":488},[151,892,893],{},"    features.plot(kind='barh', figsize=(10, 5))\n",[151,895,896],{"class":153,"line":494},[151,897,170],{"emptyLinePlaceholder":169},[151,899,900],{"class":153,"line":500},[151,901,902],{},"measure_importance_rdf(X_train, y_train, X_test, y_test)\n",[219,904],{"alt":221,"img-src":905},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_33.png",[14,907,908],{},"グラフを見ると、性別と年齢が大事という結果に。",[14,910,911],{},"これは説明がつきそうな結果ですね。緊急時は、女性と子供と金持ちが優先されるということかな。",[14,913,914],{},"続いて、前回同様に各種アルゴリズムに投入してみる。",[142,916,918],{"className":144,"code":917,"language":146,"meta":147,"style":147},"from sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier # K近傍法\nfrom sklearn.svm import SVC # サポートベクターマシン\nfrom sklearn.tree import DecisionTreeClassifier # 決定木\nfrom sklearn.ensemble import RandomForestClassifier # ランダムフォレスト\nfrom sklearn.ensemble import AdaBoostClassifier # AdaBoost\nfrom sklearn.naive_bayes import GaussianNB # ナイーブ・ベイズ\n\ndef test_algorithm(X_train, X_test, y_train, y_test, classifiers, names, version):\n    result = []\n\n    for clf, name in zip(classifiers, names):\n        clf.fit(X_train.values, y_train.values)\n        score1 = clf.score(X_train, y_train)\n        score2 = clf.score(X_test, y_test)\n        fname = name + \"_\" + version + \"{}\".format(\".pickle\")\n        result.append([score1, score2])\n        with open(fname, 'wb') as f:\n            pickle.dump(clf, f)\n\n    df_result = pd.DataFrame(result, columns=['train', 'test'], index = names)\n    print(df_result)\n\nclassifiers = [\n    LogisticRegression(),\n    KNeighborsClassifier(),\n    DecisionTreeClassifier(),\n    RandomForestClassifier(),\n    AdaBoostClassifier(),\n    GaussianNB(),\n]\n\nnames = [\"Logistic Regression\", \"Nearest Neighbors\", \"Decision Tree\",\"Random Forest\", \"AdaBoost\", \"Naive Bayes\"]\n\ntest_algorithm(X_train, X_test, y_train, y_test, classifiers, names, version=\"4\")\n",[133,919,920,925,930,935,940,945,950,955,959,964,969,973,978,983,988,993,998,1003,1008,1013,1017,1022,1027,1031,1036,1041,1047,1053,1059,1065,1071,1077,1082,1088,1093],{"__ignoreMap":147},[151,921,922],{"class":153,"line":154},[151,923,924],{},"from sklearn.linear_model import LogisticRegression\n",[151,926,927],{"class":153,"line":160},[151,928,929],{},"from sklearn.neighbors import KNeighborsClassifier # K近傍法\n",[151,931,932],{"class":153,"line":166},[151,933,934],{},"from sklearn.svm import SVC # サポートベクターマシン\n",[151,936,937],{"class":153,"line":173},[151,938,939],{},"from sklearn.tree import DecisionTreeClassifier # 決定木\n",[151,941,942],{"class":153,"line":179},[151,943,944],{},"from sklearn.ensemble import RandomForestClassifier # ランダムフォレスト\n",[151,946,947],{"class":153,"line":185},[151,948,949],{},"from sklearn.ensemble import AdaBoostClassifier # AdaBoost\n",[151,951,952],{"class":153,"line":191},[151,953,954],{},"from sklearn.naive_bayes import GaussianNB # ナイーブ・ベイズ\n",[151,956,957],{"class":153,"line":196},[151,958,170],{"emptyLinePlaceholder":169},[151,960,961],{"class":153,"line":202},[151,962,963],{},"def test_algorithm(X_train, X_test, y_train, y_test, classifiers, names, version):\n",[151,965,966],{"class":153,"line":208},[151,967,968],{},"    result = []\n",[151,970,971],{"class":153,"line":214},[151,972,170],{"emptyLinePlaceholder":169},[151,974,975],{"class":153,"line":477},[151,976,977],{},"    for clf, name in zip(classifiers, names):\n",[151,979,980],{"class":153,"line":482},[151,981,982],{},"        clf.fit(X_train.values, y_train.values)\n",[151,984,985],{"class":153,"line":488},[151,986,987],{},"        score1 = clf.score(X_train, y_train)\n",[151,989,990],{"class":153,"line":494},[151,991,992],{},"        score2 = clf.score(X_test, y_test)\n",[151,994,995],{"class":153,"line":500},[151,996,997],{},"        fname = name + \"_\" + version + \"{}\".format(\".pickle\")\n",[151,999,1000],{"class":153,"line":506},[151,1001,1002],{},"        result.append([score1, score2])\n",[151,1004,1005],{"class":153,"line":512},[151,1006,1007],{},"        with open(fname, 'wb') as f:\n",[151,1009,1010],{"class":153,"line":518},[151,1011,1012],{},"            pickle.dump(clf, f)\n",[151,1014,1015],{"class":153,"line":523},[151,1016,170],{"emptyLinePlaceholder":169},[151,1018,1019],{"class":153,"line":529},[151,1020,1021],{},"    df_result = pd.DataFrame(result, columns=['train', 'test'], index = names)\n",[151,1023,1024],{"class":153,"line":534},[151,1025,1026],{},"    print(df_result)\n",[151,1028,1029],{"class":153,"line":539},[151,1030,170],{"emptyLinePlaceholder":169},[151,1032,1033],{"class":153,"line":544},[151,1034,1035],{},"classifiers = [\n",[151,1037,1038],{"class":153,"line":549},[151,1039,1040],{},"    LogisticRegression(),\n",[151,1042,1044],{"class":153,"line":1043},26,[151,1045,1046],{},"    KNeighborsClassifier(),\n",[151,1048,1050],{"class":153,"line":1049},27,[151,1051,1052],{},"    DecisionTreeClassifier(),\n",[151,1054,1056],{"class":153,"line":1055},28,[151,1057,1058],{},"    RandomForestClassifier(),\n",[151,1060,1062],{"class":153,"line":1061},29,[151,1063,1064],{},"    AdaBoostClassifier(),\n",[151,1066,1068],{"class":153,"line":1067},30,[151,1069,1070],{},"    GaussianNB(),\n",[151,1072,1074],{"class":153,"line":1073},31,[151,1075,1076],{},"]\n",[151,1078,1080],{"class":153,"line":1079},32,[151,1081,170],{"emptyLinePlaceholder":169},[151,1083,1085],{"class":153,"line":1084},33,[151,1086,1087],{},"names = [\"Logistic Regression\", \"Nearest Neighbors\", \"Decision Tree\",\"Random Forest\", \"AdaBoost\", \"Naive Bayes\"]\n",[151,1089,1091],{"class":153,"line":1090},34,[151,1092,170],{"emptyLinePlaceholder":169},[151,1094,1096],{"class":153,"line":1095},35,[151,1097,1098],{},"test_algorithm(X_train, X_test, y_train, y_test, classifiers, names, version=\"4\")\n",[219,1100],{"alt":221,"img-src":1101},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_34.png",[14,1103,1104],{},"ランダムフォレスト、決定木、ロジスティック回帰がいい性能をだしてることがわかる。",[14,1106,1107],{},"前回の記事の結果と比べると、全体的に少しだけ良性能が向上しているように見える。何回か試してみた感じでも、今回のほうが安定していた。",[14,1109,1110],{},"決定木は汎化性能をあげるのが難しそうなので、ランダムフォレストとロジスティック回帰のハイパーパラメータチューニングをすることにする。",[142,1112,1114],{"className":144,"code":1113,"language":146,"meta":147,"style":147},"version = \"3\"\nfname_lgr = \"LogisticRegression_with_best_param_{}\".format(version) + \"_{}\".format(\".pickle\")\n\ndef grid_search_lgr(X_train, y_train, X_test, y_test, fname):\n    # ロジスティック回帰のパラメータ\n    parameter={\"C\":[10**i for i in range(-2,4)],\"random_state\":[1, 2, 3],}\n    grid_search = GridSearchCV(LogisticRegression(), parameter, cv=5)\n    grid_search.fit(X_train, y_train)\n\n    print(\"Best parameters : {}\".format(grid_search.best_params_))\n    print(\"Best cross-validation score : {:.3f}\".format(grid_search.best_score_))\n\n    lgr = LogisticRegression(**grid_search.best_params_)\n    lgr.fit(X_train, y_train)\n\n    with open(fname, 'wb') as f:\n            pickle.dump(lgr, f)\n    test_score = lgr.score(X_test, y_test)\n    print(\"Score with best parameteras : {}\".format(test_score))\n\ngrid_search_lgr(X_train, y_train, X_test, y_test, fname_lgr)\n",[133,1115,1116,1121,1126,1130,1135,1140,1145,1150,1155,1159,1164,1169,1173,1178,1183,1187,1192,1197,1202,1207,1211],{"__ignoreMap":147},[151,1117,1118],{"class":153,"line":154},[151,1119,1120],{},"version = \"3\"\n",[151,1122,1123],{"class":153,"line":160},[151,1124,1125],{},"fname_lgr = \"LogisticRegression_with_best_param_{}\".format(version) + \"_{}\".format(\".pickle\")\n",[151,1127,1128],{"class":153,"line":166},[151,1129,170],{"emptyLinePlaceholder":169},[151,1131,1132],{"class":153,"line":173},[151,1133,1134],{},"def grid_search_lgr(X_train, y_train, X_test, y_test, fname):\n",[151,1136,1137],{"class":153,"line":179},[151,1138,1139],{},"    # ロジスティック回帰のパラメータ\n",[151,1141,1142],{"class":153,"line":185},[151,1143,1144],{},"    parameter={\"C\":[10**i for i in range(-2,4)],\"random_state\":[1, 2, 3],}\n",[151,1146,1147],{"class":153,"line":191},[151,1148,1149],{},"    grid_search = GridSearchCV(LogisticRegression(), parameter, cv=5)\n",[151,1151,1152],{"class":153,"line":196},[151,1153,1154],{},"    grid_search.fit(X_train, y_train)\n",[151,1156,1157],{"class":153,"line":202},[151,1158,170],{"emptyLinePlaceholder":169},[151,1160,1161],{"class":153,"line":208},[151,1162,1163],{},"    print(\"Best parameters : {}\".format(grid_search.best_params_))\n",[151,1165,1166],{"class":153,"line":214},[151,1167,1168],{},"    print(\"Best cross-validation score : {:.3f}\".format(grid_search.best_score_))\n",[151,1170,1171],{"class":153,"line":477},[151,1172,170],{"emptyLinePlaceholder":169},[151,1174,1175],{"class":153,"line":482},[151,1176,1177],{},"    lgr = LogisticRegression(**grid_search.best_params_)\n",[151,1179,1180],{"class":153,"line":488},[151,1181,1182],{},"    lgr.fit(X_train, y_train)\n",[151,1184,1185],{"class":153,"line":494},[151,1186,170],{"emptyLinePlaceholder":169},[151,1188,1189],{"class":153,"line":500},[151,1190,1191],{},"    with open(fname, 'wb') as f:\n",[151,1193,1194],{"class":153,"line":506},[151,1195,1196],{},"            pickle.dump(lgr, f)\n",[151,1198,1199],{"class":153,"line":512},[151,1200,1201],{},"    test_score = lgr.score(X_test, y_test)\n",[151,1203,1204],{"class":153,"line":518},[151,1205,1206],{},"    print(\"Score with best parameteras : {}\".format(test_score))\n",[151,1208,1209],{"class":153,"line":523},[151,1210,170],{"emptyLinePlaceholder":169},[151,1212,1213],{"class":153,"line":529},[151,1214,1215],{},"grid_search_lgr(X_train, y_train, X_test, y_test, fname_lgr)\n",[219,1217],{"alt":221,"img-src":1218},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_35.png",[14,1220,1221],{},"以前と同様で、チューニングしてもあまり変わらない（変わる時もあった）。\nランダムフォレストもやってみる。",[142,1223,1225],{"className":144,"code":1224,"language":146,"meta":147,"style":147},"version = \"3\"\nfname_rf = \"RandomForesr_with_best_param_{}\".format(version) + \"_{}\".format(\".pickle\")\n\ndef grid_search_rdf(X_train, y_train, X_test, y_test, fname_rf):\n    # n_estimators → 決定木の数\n    # max_features → 決定木の特徴量の数\n    # random_state → 乱数シード\n    # max_depth → 決定木の深さ\n    # min_samples_split → 決定木を分割する際のサンプル数の最小数\n    parameter={\n        \"n_estimators\":[i  for i in range(10, 100, 10)],\n        \"criterion\":[\"gini\",\"entropy\"],\n        \"max_depth\":[i for i in range(1,10,1)],\n        \"min_samples_split\": [2, 4, 10,12,16],\n        \"random_state\":[3]\n    }\n\n    grid_search = GridSearchCV(RandomForestClassifier(), parameter, cv=5)\n    grid_search.fit(X_train, y_train)\n\n    print(\"Best parameters : {}\".format(grid_search.best_params_))\n    print(\"Best cross-validation score : {:.3f}\".format(grid_search.best_score_))\n\n    rf = RandomForestClassifier(**grid_search.best_params_)\n    rf.fit(X_train, y_train)\n    #fname = \"RandomForesr_with_best_param\" + \"{}\".format(\".pickle\")\n    with open(fname_rf, 'wb') as f:\n            pickle.dump(rf, f)\n    test_score = rf.score(X_test, y_test)\n    print(\"Score with best parameteras : {}\".format(test_score))\n\ngrid_search_rdf(X_train, y_train, X_test, y_test, fname_rf)\n",[133,1226,1227,1231,1236,1240,1245,1250,1255,1260,1265,1270,1275,1280,1285,1290,1295,1300,1305,1309,1314,1318,1322,1326,1330,1334,1339,1343,1348,1353,1358,1363,1367,1371],{"__ignoreMap":147},[151,1228,1229],{"class":153,"line":154},[151,1230,1120],{},[151,1232,1233],{"class":153,"line":160},[151,1234,1235],{},"fname_rf = \"RandomForesr_with_best_param_{}\".format(version) + \"_{}\".format(\".pickle\")\n",[151,1237,1238],{"class":153,"line":166},[151,1239,170],{"emptyLinePlaceholder":169},[151,1241,1242],{"class":153,"line":173},[151,1243,1244],{},"def grid_search_rdf(X_train, y_train, X_test, y_test, fname_rf):\n",[151,1246,1247],{"class":153,"line":179},[151,1248,1249],{},"    # n_estimators → 決定木の数\n",[151,1251,1252],{"class":153,"line":185},[151,1253,1254],{},"    # max_features → 決定木の特徴量の数\n",[151,1256,1257],{"class":153,"line":191},[151,1258,1259],{},"    # random_state → 乱数シード\n",[151,1261,1262],{"class":153,"line":196},[151,1263,1264],{},"    # max_depth → 決定木の深さ\n",[151,1266,1267],{"class":153,"line":202},[151,1268,1269],{},"    # min_samples_split → 決定木を分割する際のサンプル数の最小数\n",[151,1271,1272],{"class":153,"line":208},[151,1273,1274],{},"    parameter={\n",[151,1276,1277],{"class":153,"line":214},[151,1278,1279],{},"        \"n_estimators\":[i  for i in range(10, 100, 10)],\n",[151,1281,1282],{"class":153,"line":477},[151,1283,1284],{},"        \"criterion\":[\"gini\",\"entropy\"],\n",[151,1286,1287],{"class":153,"line":482},[151,1288,1289],{},"        \"max_depth\":[i for i in range(1,10,1)],\n",[151,1291,1292],{"class":153,"line":488},[151,1293,1294],{},"        \"min_samples_split\": [2, 4, 10,12,16],\n",[151,1296,1297],{"class":153,"line":494},[151,1298,1299],{},"        \"random_state\":[3]\n",[151,1301,1302],{"class":153,"line":500},[151,1303,1304],{},"    }\n",[151,1306,1307],{"class":153,"line":506},[151,1308,170],{"emptyLinePlaceholder":169},[151,1310,1311],{"class":153,"line":512},[151,1312,1313],{},"    grid_search = GridSearchCV(RandomForestClassifier(), parameter, cv=5)\n",[151,1315,1316],{"class":153,"line":518},[151,1317,1154],{},[151,1319,1320],{"class":153,"line":523},[151,1321,170],{"emptyLinePlaceholder":169},[151,1323,1324],{"class":153,"line":529},[151,1325,1163],{},[151,1327,1328],{"class":153,"line":534},[151,1329,1168],{},[151,1331,1332],{"class":153,"line":539},[151,1333,170],{"emptyLinePlaceholder":169},[151,1335,1336],{"class":153,"line":544},[151,1337,1338],{},"    rf = RandomForestClassifier(**grid_search.best_params_)\n",[151,1340,1341],{"class":153,"line":549},[151,1342,863],{},[151,1344,1345],{"class":153,"line":1043},[151,1346,1347],{},"    #fname = \"RandomForesr_with_best_param\" + \"{}\".format(\".pickle\")\n",[151,1349,1350],{"class":153,"line":1049},[151,1351,1352],{},"    with open(fname_rf, 'wb') as f:\n",[151,1354,1355],{"class":153,"line":1055},[151,1356,1357],{},"            pickle.dump(rf, f)\n",[151,1359,1360],{"class":153,"line":1061},[151,1361,1362],{},"    test_score = rf.score(X_test, y_test)\n",[151,1364,1365],{"class":153,"line":1067},[151,1366,1206],{},[151,1368,1369],{"class":153,"line":1073},[151,1370,170],{"emptyLinePlaceholder":169},[151,1372,1373],{"class":153,"line":1079},[151,1374,1375],{},"grid_search_rdf(X_train, y_train, X_test, y_test, fname_rf)\n",[219,1377],{"alt":221,"img-src":1378},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_36.png",[14,1380,1381],{},"こっちはだいぶよくなる。",[14,1383,1384],{},"検証セットでの結果はいまいちだが。",[10,1386,1387],{"id":1387},"モデルの評価",[14,1389,1390],{},"モデルの評価では、以下のことをみてみる。",[21,1392,1393,1396,1399],{},[24,1394,1395],{},"混合行列",[24,1397,1398],{},"適合率、再現率、F1 スコア",[24,1400,1401],{},"ROC 曲線",[14,1403,1404],{},"今回は次のようにこれらをまとめた関数でいっぺんに表示する。\nまずはロジスティック回帰。",[142,1406,1408],{"className":144,"code":1407,"language":146,"meta":147,"style":147},"from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc\n\ndef evaluation(model, X_test, y_test):\n    y_pred = model.predict(X_test)\n    m = confusion_matrix(y_test, y_pred)\n    print(\"Confusion matrix:\\n{}\".format(m))\n    print()\n    print(\"適合率：{:.3f}\".format(precision_score(y_test, y_pred)))\n    print(\"再現率：{:.3f}\".format(recall_score(y_test, y_pred)))\n    print(\"f1：{:.3f}\".format(f1_score(y_test, y_pred)))\n\n    y_pred = model.predict_proba(X_test)[:,1]\n    fpr, tpr, thresholds = roc_curve(y_test, y_pred)\n\n    auc_num = auc(fpr, tpr)\n\n    plt.plot(fpr, tpr, color=\"blue\", label=\"ROC curve (area = %.3f)\" % auc_num)\n    plt.plot([0,1], [0,1], color=\"black\", linestyle=\"--\")\n    plt.xlim([0.0, 1.0])\n    plt.ylim([0.0, 1.05])\n    plt.xlabel(\"False positive rate\")\n    plt.xlabel(\"True positive rate\")\n    plt.legend(loc=\"right lower\")\n\nwith open(fname_lgr, mode='rb') as fp:\n    lgr = pickle.load(fp)\nevaluation(lgr, X_test, y_test)\n",[133,1409,1410,1415,1419,1424,1429,1434,1439,1444,1449,1454,1459,1463,1468,1473,1477,1482,1486,1491,1496,1501,1506,1511,1516,1521,1525,1530,1535],{"__ignoreMap":147},[151,1411,1412],{"class":153,"line":154},[151,1413,1414],{},"from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc\n",[151,1416,1417],{"class":153,"line":160},[151,1418,170],{"emptyLinePlaceholder":169},[151,1420,1421],{"class":153,"line":166},[151,1422,1423],{},"def evaluation(model, X_test, y_test):\n",[151,1425,1426],{"class":153,"line":173},[151,1427,1428],{},"    y_pred = model.predict(X_test)\n",[151,1430,1431],{"class":153,"line":179},[151,1432,1433],{},"    m = confusion_matrix(y_test, y_pred)\n",[151,1435,1436],{"class":153,"line":185},[151,1437,1438],{},"    print(\"Confusion matrix:\\n{}\".format(m))\n",[151,1440,1441],{"class":153,"line":191},[151,1442,1443],{},"    print()\n",[151,1445,1446],{"class":153,"line":196},[151,1447,1448],{},"    print(\"適合率：{:.3f}\".format(precision_score(y_test, y_pred)))\n",[151,1450,1451],{"class":153,"line":202},[151,1452,1453],{},"    print(\"再現率：{:.3f}\".format(recall_score(y_test, y_pred)))\n",[151,1455,1456],{"class":153,"line":208},[151,1457,1458],{},"    print(\"f1：{:.3f}\".format(f1_score(y_test, y_pred)))\n",[151,1460,1461],{"class":153,"line":214},[151,1462,170],{"emptyLinePlaceholder":169},[151,1464,1465],{"class":153,"line":477},[151,1466,1467],{},"    y_pred = model.predict_proba(X_test)[:,1]\n",[151,1469,1470],{"class":153,"line":482},[151,1471,1472],{},"    fpr, tpr, thresholds = roc_curve(y_test, y_pred)\n",[151,1474,1475],{"class":153,"line":488},[151,1476,170],{"emptyLinePlaceholder":169},[151,1478,1479],{"class":153,"line":494},[151,1480,1481],{},"    auc_num = auc(fpr, tpr)\n",[151,1483,1484],{"class":153,"line":500},[151,1485,170],{"emptyLinePlaceholder":169},[151,1487,1488],{"class":153,"line":506},[151,1489,1490],{},"    plt.plot(fpr, tpr, color=\"blue\", label=\"ROC curve (area = %.3f)\" % auc_num)\n",[151,1492,1493],{"class":153,"line":512},[151,1494,1495],{},"    plt.plot([0,1], [0,1], color=\"black\", linestyle=\"--\")\n",[151,1497,1498],{"class":153,"line":518},[151,1499,1500],{},"    plt.xlim([0.0, 1.0])\n",[151,1502,1503],{"class":153,"line":523},[151,1504,1505],{},"    plt.ylim([0.0, 1.05])\n",[151,1507,1508],{"class":153,"line":529},[151,1509,1510],{},"    plt.xlabel(\"False positive rate\")\n",[151,1512,1513],{"class":153,"line":534},[151,1514,1515],{},"    plt.xlabel(\"True positive rate\")\n",[151,1517,1518],{"class":153,"line":539},[151,1519,1520],{},"    plt.legend(loc=\"right lower\")\n",[151,1522,1523],{"class":153,"line":544},[151,1524,170],{"emptyLinePlaceholder":169},[151,1526,1527],{"class":153,"line":549},[151,1528,1529],{},"with open(fname_lgr, mode='rb') as fp:\n",[151,1531,1532],{"class":153,"line":1043},[151,1533,1534],{},"    lgr = pickle.load(fp)\n",[151,1536,1537],{"class":153,"line":1049},[151,1538,1539],{},"evaluation(lgr, X_test, y_test)\n",[219,1541],{"alt":221,"img-src":1542},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_37.png",[14,1544,1545],{},"再現率低い。。",[14,1547,1548],{},"次はランダムフォレスト。",[142,1550,1552],{"className":144,"code":1551,"language":146,"meta":147,"style":147},"with open(fname_rf, mode='rb') as fp:\n    rf = pickle.load(fp)\nevaluation(rf, X_test, y_test)\n",[133,1553,1554,1559,1564],{"__ignoreMap":147},[151,1555,1556],{"class":153,"line":154},[151,1557,1558],{},"with open(fname_rf, mode='rb') as fp:\n",[151,1560,1561],{"class":153,"line":160},[151,1562,1563],{},"    rf = pickle.load(fp)\n",[151,1565,1566],{"class":153,"line":166},[151,1567,1568],{},"evaluation(rf, X_test, y_test)\n",[219,1570],{"alt":221,"img-src":1571},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_38.png",[14,1573,1574],{},"おっ。ランダムフォレストが負けてる？",[14,1576,1577],{},"再現率がさらに低いな。。",[14,1579,1580],{},"AUC でもわずかに劣ってる。\nただ、ROC の形状的にはランダムフォレストのほうがよさそう？？",[14,1582,1583],{},"なんだか予想がきかないデータが一定数ある気がする。外れ値的なやつかな。",[14,1585,1586],{},"いや、ランダムフォレストの結果をみると、ラベルに偏りがありそう。\nサンプリングのランダム性を収束させるために回数まわして平均とったほうがいいかも。",[10,1588,1590],{"id":1589},"kaggle-でテスト結果を提出してみた","Kaggle でテスト結果を提出してみた",[14,1592,1593],{},"やり残しというか、まだまだ考慮すべきことはあるが、ここまできたらテストデータに対するスコアをみてみたい。\nということで、Kaggle で初提出してみた。",[14,1595,1596,1597,1600,1601,1604],{},"コード中の",[133,1598,1599],{},"test_data","は",[133,1602,1603],{},"train_data","と同様の処理をして作った。\n（テストにしかない欠損があったのでそこは別途埋める必要あり）",[142,1606,1608],{"className":144,"code":1607,"language":146,"meta":147,"style":147},"result = lgr.predict(test_data)\n#result = rf.predict(test_data)\n\nsubmission = pd.DataFrame({\n        \"PassengerId\": test_data_copy[\"PassengerId\"],\n        \"Survived\": result\n    })\n\nsubmission.to_csv(\"submission.csv\", index=False)\n",[133,1609,1610,1615,1620,1624,1629,1634,1639,1644,1648],{"__ignoreMap":147},[151,1611,1612],{"class":153,"line":154},[151,1613,1614],{},"result = lgr.predict(test_data)\n",[151,1616,1617],{"class":153,"line":160},[151,1618,1619],{},"#result = rf.predict(test_data)\n",[151,1621,1622],{"class":153,"line":166},[151,1623,170],{"emptyLinePlaceholder":169},[151,1625,1626],{"class":153,"line":173},[151,1627,1628],{},"submission = pd.DataFrame({\n",[151,1630,1631],{"class":153,"line":179},[151,1632,1633],{},"        \"PassengerId\": test_data_copy[\"PassengerId\"],\n",[151,1635,1636],{"class":153,"line":185},[151,1637,1638],{},"        \"Survived\": result\n",[151,1640,1641],{"class":153,"line":191},[151,1642,1643],{},"    })\n",[151,1645,1646],{"class":153,"line":196},[151,1647,170],{"emptyLinePlaceholder":169},[151,1649,1650],{"class":153,"line":202},[151,1651,1652],{},"submission.to_csv(\"submission.csv\", index=False)\n",[14,1654,1655],{},"これで Commit して終了。",[14,1657,1658],{},"結果はこんな感じでした。",[1660,1661],"hr",{},[10,1663,1665,1666,1669],{"id":1664},"title-機械学習初心者がkaggleのtitanicで勉強してみたモデル評価編createdat-2019-03-10updatedat-2019-03-10tags-機械学習draft-falsedescription-thumbnail-imgtwitter-cardpng","title: 【機械学習】初心者がKaggleのtitanicで勉強してみた(モデル評価編)\ncreatedAt: '2019-03-10'\nupdatedAt: '2019-03-10'\ntags: ",[151,1667,1668],{},"'機械学習'","\ndraft: false\ndescription:  ''\nthumbnail: '\u002Fimg\u002Ftwitter-card.png'",[1671,1672,5],"h1",{"id":1673},"機械学習初心者がkaggleのtitanicで勉強してみたモデル評価編",[14,1675,1676],{},"結局、ランダムフォレストの方がよかった。",[14,1678,1679],{},"実は、前節でのスコアはたまたまで、数回試したらほとんどの場合ランダムフォレストのほうが性能がよかった。",[10,1681,1682],{"id":1682},"反省点",[14,1684,1685],{},"これまでの反省点まとめます。",[14,1687,1688],{},"まず前処理編で作ったパイプラインは作らなくてよかった。",[14,1690,1691],{},"それぞれクラスにする必要性は感じなかったし、クラスももっと柔軟に変更できるようにすべきだった。\nあのようなパイプラインを作った理由の一つに、データが DataFrame のままだとモデルに投入できないと思っていたことがある。\nしかし、DataFrame で普通にできることに途中で気づいた。",[14,1693,1694],{},"前処理と特徴量エンジニアリングで想像以上にデータ変更の機会が多く、ここは最初にそんなに作り込むべきではないと思った。\n少なくとも、素人が最初にやらないほうがいいと思う。",[14,1696,1697],{},"とりあえず、モデルの検証までやって、特徴量をいじりながら効率的な方法を模索しながらやったほうが時間がかからなかったように思える。\n慣れてくれば、汎用的な関数なりクラスを作ってのぞむ感じがよさそう。",[10,1699,1700],{"id":1700},"まとめ",[14,1702,1703],{},"ようやく Titanic 卒業しました。",[14,1705,1706],{},"機械学習の入門編ということでやってましたが、ひじょーーーーーーーに勉強になった。\nTitanic ちゃんとやるとほんとに難しいし、学びも多い。\nというかわからないことが多すぎた。",[14,1708,1709],{},"とくに特徴量エンジニアリングはかなり悩んだ。他の人の解説等をみてもどういう根拠に基づく処理なのかわからず、理解するのに時間がかかった。\n記事内には書いてないが、かなりいろいろ試してます笑",[14,1711,1712],{},"web 上のコードを調べると Name から Mr とかを抽出して特徴量にするアプローチがあったんですが、今回はやらないことにしました。\n理由は、自分でまったく思いつかなかったことで悔しかったからです笑",[14,1714,1715],{},"最終的なスコアが低いのかと思っていたが、これって問題の難しさに依存するのでそこまで気にしなくていいのではと思い始めた（あきらめ）。\nKaggle 内の順位とは関係なしに考えると、生存者予測で 8 割近い精度ならまあまあ予測できていると思う。",[14,1717,1718],{},"次は Deep なほうを勉強していきます！",[14,1720,1721],{},"参考書籍",[21,1723,1724,1731],{},[24,1725,1726],{},[27,1727,1730],{"href":1728,"rel":1729},"https:\u002F\u002Famzn.to\u002F38PNOK9",[268],"scikit-learn と TensorFlow による実践機械学習",[24,1732,1733],{},[27,1734,1737],{"href":1735,"rel":1736},"https:\u002F\u002Famzn.to\u002F2O246HK",[268],"Python ではじめる機械学習 ―scikit-learn で学ぶ特徴量エンジニアリングと機械学習の基礎",[1739,1740,1741],"style",{},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}",{"title":147,"searchDepth":160,"depth":160,"links":1743},[1744,1745,1746,1747,1748,1749,1751,1752],{"id":12,"depth":160,"text":12},{"id":73,"depth":160,"text":73},{"id":817,"depth":160,"text":817},{"id":1387,"depth":160,"text":1387},{"id":1589,"depth":160,"text":1590},{"id":1664,"depth":160,"text":1750},"title: 【機械学習】初心者がKaggleのtitanicで勉強してみた(モデル評価編)\ncreatedAt: '2019-03-10'\nupdatedAt: '2019-03-10'\ntags: '機械学習'\ndraft: false\ndescription:  ''\nthumbnail: '\u002Fimg\u002Ftwitter-card.png'",{"id":1682,"depth":160,"text":1682},{"id":1700,"depth":160,"text":1700},"2019-03-10",false,"md",{},"\u002Fcontents\u002Fml_titanic_part4",{"title":5,"description":147},"contents\u002Fml_titanic_part4",[1761],"機械学習","\u002Fimg\u002Ftwitter-card.png","vEAb_vmLO0_bFT1TU5BFlX06-4WRvM4HdagfVRLJ5uw",[1765,1769],{"title":1766,"path":1767,"stem":1768,"children":-1},"【機械学習】初心者がKaggleのtitanicで勉強してみた(アルゴリズム選定編)","\u002Fcontents\u002Fml_titanic_part3","contents\u002Fml_titanic_part3",{"title":1770,"path":1771,"stem":1772,"children":-1},"Multi-Agent Transactive Memory","\u002Fcontents\u002Fmulti-agent-transactive-memory","contents\u002Fmulti-agent-transactive-memory",1782055098839]