[{"data":1,"prerenderedAt":814},["ShallowReactive",2],{"content-\u002Fcontents\u002Fml_titanic_part2":3,"surroundPost-\u002Fcontents\u002Fml_titanic_part2":805},{"id":4,"title":5,"body":6,"createdAt":794,"description":83,"draft":795,"extension":796,"meta":797,"navigation":100,"path":798,"seo":799,"stem":800,"tags":801,"thumbnail":803,"updatedAt":794,"__hash__":804},"contents\u002Fcontents\u002Fml_titanic_part2.md","【機械学習】初心者がKaggleのtitanicで勉強してみた(前処理編)",{"type":7,"value":8,"toc":786},"minimark",[9,13,20,27,31,34,40,43,46,52,58,61,67,71,77,178,204,226,229,271,283,286,388,391,411,420,424,430,436,473,486,489,492,569,574,584,587,590,652,655,661,664,722,731,741,744,747,759,762,765,782],[10,11,12],"p",{},"初心者が Kaggle の Titanic をやってみた 2 回目前処理編やります！",[10,14,15],{},[16,17,19],"a",{"href":18},"ml_titanic","【機械学習】初心者が Kaggle の titanic で勉強してみた",[10,21,22,23,26],{},"前回は、データを眺めて、データの相関を確かめた。",[24,25],"br",{},"\n今回は、それらの情報から実際にデータを処理していく。",[28,29,30],"h2",{"id":30},"前処理",[10,32,33],{},"※ 自分の理解です。",[10,35,36,37,39],{},"前処理とは、与えられたデータを使ってモデルを作成する前にデータを適した形に直す処理のこと。",[24,38],{},"\n参考書を読むと、もっと詳しく体系的に書かれているが、ざっくり説明するとこんな感じだろうと思う。",[10,41,42],{},"では、なぜ前処理をするか？",[10,44,45],{},"一般的にデータは汚いからです！",[10,47,48,49,51],{},"汚いとは、言い換えると解析に適さない形、不要なものがたくさん含まれているということである。",[24,50],{},"\n例えば、今回でいうと欠損値であったり外れ値であったりといろいろある。",[10,53,54,55,57],{},"ちなみにこの Titanic のデータはとてもとても綺麗なデータです。",[24,56],{},"\n欠損値を補完して、カテゴリ変数を数値化したらモデルに入れられそう。",[28,59,60],{"id":60},"方針",[10,62,63,64,66],{},"前処理をやるにあたって今回は前処理パイプラインというものを試してみる。",[24,65],{},"\nパイプラインを使わずにもできるのだが、のちに再利用することや特徴量選択のことを考えて、パイプラインを使用する。",[28,68,70],{"id":69},"前処理数値","前処理：数値",[10,72,73,74,76],{},"まずは、数値データの処理からやっていく。",[24,75],{},"\nそこでデータから数値データのみを取得するクラスを定義する。",[78,79,84],"pre",{"className":80,"code":81,"language":82,"meta":83,"style":83},"language-py shiki shiki-themes github-dark","from sklearn.base import BaseEstimator, TransformerMixin\n\n## データフレームから指定したタイプの列を抽出\nclass DataFrameExtracter(BaseEstimator, TransformerMixin):\n    # dataTypeはnumberかobject\n  
  def __init__(self, dataType):\n        self.dataType = dataType\n\n    # 変換式を計算\n    def fit(self, X, y=None):\n        return self\n\n    # 変換式を用いてデータを変換\n    def transform(self, X, y=None):\n        return X.select_dtypes(include=[self.dataType])\n","py","",[85,86,87,95,102,108,114,120,126,132,137,143,149,155,160,166,172],"code",{"__ignoreMap":83},[88,89,92],"span",{"class":90,"line":91},"line",1,[88,93,94],{},"from sklearn.base import BaseEstimator, TransformerMixin\n",[88,96,98],{"class":90,"line":97},2,[88,99,101],{"emptyLinePlaceholder":100},true,"\n",[88,103,105],{"class":90,"line":104},3,[88,106,107],{},"## データフレームから指定したタイプの列を抽出\n",[88,109,111],{"class":90,"line":110},4,[88,112,113],{},"class DataFrameExtracter(BaseEstimator, TransformerMixin):\n",[88,115,117],{"class":90,"line":116},5,[88,118,119],{},"    # dataTypeはnumberかobject\n",[88,121,123],{"class":90,"line":122},6,[88,124,125],{},"    def __init__(self, dataType):\n",[88,127,129],{"class":90,"line":128},7,[88,130,131],{},"        self.dataType = dataType\n",[88,133,135],{"class":90,"line":134},8,[88,136,101],{"emptyLinePlaceholder":100},[88,138,140],{"class":90,"line":139},9,[88,141,142],{},"    # 変換式を計算\n",[88,144,146],{"class":90,"line":145},10,[88,147,148],{},"    def fit(self, X, y=None):\n",[88,150,152],{"class":90,"line":151},11,[88,153,154],{},"        return self\n",[88,156,158],{"class":90,"line":157},12,[88,159,101],{"emptyLinePlaceholder":100},[88,161,163],{"class":90,"line":162},13,[88,164,165],{},"    # 変換式を用いてデータを変換\n",[88,167,169],{"class":90,"line":168},14,[88,170,171],{},"    def transform(self, X, y=None):\n",[88,173,175],{"class":90,"line":174},15,[88,176,177],{},"        return X.select_dtypes(include=[self.dataType])\n",[10,179,180,181,184,185,188,189,191,192,194,195,197,198,200,201,203],{},"1 行目の",[85,182,183],{},"BaseEstimator, TransformerMixin","を import 
することで",[85,186,187],{},"fit_transform","などのメソッドが継承される。",[24,190],{},"\nこの",[85,193,187],{},"が継承されるとなぜ嬉しいかというと、のちに定義するパイプラインは",[85,196,187],{},"メソッドを順次呼び出していくからである。",[24,199],{},"\nなので、↑ のコードでは",[85,202,187],{},"を省略している。",[10,205,206,207,210,211,213,214,217,218,221,222,225],{},"入力は pandas の DataFrame を想定していて、",[85,208,209],{},"select_dtypes","で指定したタイプを取り出すことができる。",[24,212],{},"\npandas は数値型",[85,215,216],{},"int","、",[85,219,220],{},"float","などをまとめて",[85,223,224],{},"number","として指定してとりだせるのでマジ便利。",[10,227,228],{},"これで数値のみの列を取り出せる。次は、取り出した数値列のうち不要なものを削除するクラスを定義する。",[78,230,232],{"className":80,"code":231,"language":82,"meta":83,"style":83},"# 指定した列を削除\nclass DropAttributes(BaseEstimator, TransformerMixin):\n    def __init__(self, dropList):\n        self.dropList = dropList\n    def fit(self, X, y=None):\n        return self\n    def transform(self, X, y=None):\n        return X.drop(self.dropList, axis=1)\n",[85,233,234,239,244,249,254,258,262,266],{"__ignoreMap":83},[88,235,236],{"class":90,"line":91},[88,237,238],{},"# 指定した列を削除\n",[88,240,241],{"class":90,"line":97},[88,242,243],{},"class DropAttributes(BaseEstimator, TransformerMixin):\n",[88,245,246],{"class":90,"line":104},[88,247,248],{},"    def __init__(self, dropList):\n",[88,250,251],{"class":90,"line":110},[88,252,253],{},"        self.dropList = dropList\n",[88,255,256],{"class":90,"line":116},[88,257,148],{},[88,259,260],{"class":90,"line":122},[88,261,154],{},[88,263,264],{"class":90,"line":128},[88,265,171],{},[88,267,268],{"class":90,"line":134},[88,269,270],{},"        return X.drop(self.dropList, axis=1)\n",[10,272,273,274,276,277,279,280,282],{},"これも pandas の drop メソッドで指定した列を削除できる。",[24,275],{},"\nちなみに、列を削除するのはあまり安易にやらないほうがいいらしい。",[24,278],{},"\n不要だと思っていても実は重要だったりといったことが起こる可能性がある。",[24,281],{},"\n今回は、早く結果までたどり着きたいので、相関が低いやつはてきとうに落としていく。",[10,284,285],{},"不要な列も取り除いたので、次は欠損値をうめるクラスを定義する。",[78,287,289],{"className":80,"code":288,"language":82,"meta":83,"style":83},"# 欠損値の補完（中央値、平均などを用いる）\nclass 
FillNa(BaseEstimator, TransformerMixin):\n    def __init__(self, valueType='median', columns=[]):\n        self.valueType = valueType\n        self.columns = columns\n\n    def fit(self, X, y=None):\n        return self\n\n    def transform(self, X, y=None):\n        if self.valueType == 'median':\n            for col in self.columns:\n                if X[col].dtype != 'object':\n                    X[col] = X[col].fillna(X[col].median())\n            return X\n        elif self.valueType == 'mean':\n            for col in self.columns:\n                if X[col].dtype != 'object':\n                    X[col] = X[col].fillna(X[col].mean())\n            return X\n",[85,290,291,296,301,306,311,316,320,324,328,332,336,341,346,351,356,361,367,372,377,383],{"__ignoreMap":83},[88,292,293],{"class":90,"line":91},[88,294,295],{},"# 欠損値の補完（中央値、平均などを用いる）\n",[88,297,298],{"class":90,"line":97},[88,299,300],{},"class FillNa(BaseEstimator, TransformerMixin):\n",[88,302,303],{"class":90,"line":104},[88,304,305],{},"    def __init__(self, valueType='median', columns=[]):\n",[88,307,308],{"class":90,"line":110},[88,309,310],{},"        self.valueType = valueType\n",[88,312,313],{"class":90,"line":116},[88,314,315],{},"        self.columns = columns\n",[88,317,318],{"class":90,"line":122},[88,319,101],{"emptyLinePlaceholder":100},[88,321,322],{"class":90,"line":128},[88,323,148],{},[88,325,326],{"class":90,"line":134},[88,327,154],{},[88,329,330],{"class":90,"line":139},[88,331,101],{"emptyLinePlaceholder":100},[88,333,334],{"class":90,"line":145},[88,335,171],{},[88,337,338],{"class":90,"line":151},[88,339,340],{},"        if self.valueType == 'median':\n",[88,342,343],{"class":90,"line":157},[88,344,345],{},"            for col in self.columns:\n",[88,347,348],{"class":90,"line":162},[88,349,350],{},"                if X[col].dtype != 'object':\n",[88,352,353],{"class":90,"line":168},[88,354,355],{},"                    X[col] = 
X[col].fillna(X[col].median())\n",[88,357,358],{"class":90,"line":174},[88,359,360],{},"            return X\n",[88,362,364],{"class":90,"line":363},16,[88,365,366],{},"        elif self.valueType == 'mean':\n",[88,368,370],{"class":90,"line":369},17,[88,371,345],{},[88,373,375],{"class":90,"line":374},18,[88,376,350],{},[88,378,380],{"class":90,"line":379},19,[88,381,382],{},"                    X[col] = X[col].fillna(X[col].mean())\n",[88,384,386],{"class":90,"line":385},20,[88,387,360],{},[10,389,390],{},"欠損値の扱い方には自分が知る限りだと次の方法がある。",[392,393,394,403,408],"ul",{},[395,396,397,398,402],"li",{},"欠損値の補完に",[399,400,401],"strong",{},"中央値","を用いる",[395,404,397,405,402],{},[399,406,407],{},"平均値",[395,409,410],{},"欠損値のある行を削除する",[10,412,413,414,416,417,419],{},"なんとなく 2 番目が良さそうだと思ったが、参考書や他の人のコードでは中央値を用いていることが多い。",[24,415],{},"\n理由は、平均だと外れ値がある場合に、その値に引っ張られるからではないかと考えている。",[24,418],{},"\nこれはあとでしっかり検証したいと思う。",[28,421,423],{"id":422},"前処理カテゴリ変数","前処理：カテゴリ変数",[10,425,426,427,429],{},"データの中には男女やなにかの種類（商品名）など、複数の数値で表現できないカテゴリが存在しているものがある。",[24,428],{},"\nそれらは当然モデルに入力できないので、なんらかの数値に変換する。",[10,431,432,433,435],{},"よく使用されるのがダミー変数化だ。",[24,434],{},"\n今回はこのダミー変数化を行うクラスを作る。",[78,437,439],{"className":80,"code":438,"language":82,"meta":83,"style":83},"# カテゴリ変数をダミー変数化\nclass DummyCat(BaseEstimator, TransformerMixin):\n    def fit(self, X, y=None):\n        return self\n    def transform(self, X, y=None):\n        X = pd.get_dummies(X)\n        return X\n",[85,440,441,446,451,455,459,463,468],{"__ignoreMap":83},[88,442,443],{"class":90,"line":91},[88,444,445],{},"# カテゴリ変数をダミー変数化\n",[88,447,448],{"class":90,"line":97},[88,449,450],{},"class DummyCat(BaseEstimator, TransformerMixin):\n",[88,452,453],{"class":90,"line":104},[88,454,148],{},[88,456,457],{"class":90,"line":110},[88,458,154],{},[88,460,461],{"class":90,"line":116},[88,462,171],{},[88,464,465],{"class":90,"line":122},[88,466,467],{},"        X = pd.get_dummies(X)\n",[88,469,470],{"class":90,"line":128},[88,471,472],{},"        
return X\n",[10,474,475,476,478,479,482,483,485],{},"なんとこれだけで実装できてしまう。",[24,477],{},"\nこれも pandas のメソッドで",[85,480,481],{},"get_dummies","を用いることで実現できる。",[24,484],{},"\nしかも、このメソッドの賢いところは、列名にカテゴリ名を含めてくれることだ。",[28,487,488],{"id":488},"前処理パイプライン",[10,490,491],{},"ここまでで必要なクラスの定義ができたので、実際にパイプラインを作っていく。",[78,493,495],{"className":80,"code":494,"language":82,"meta":83,"style":83},"from sklearn.pipeline import Pipeline\nimport numpy as np\n\n# いらないと思われる列を指定\ndropNumList = ['PassengerId', 'Survived']\n\n# 数値列に対して欠損値の補完などを行う\nnum_pipeline = Pipeline([\n    ('selector', DataFrameExtracter(dataType='number')),\n    ('dropAtt', DropAttributes(dropList=dropNumList)),\n    ('fillNa', FillNa(valueType='median', columns=['Age'])),\n])\n\n# 前処理済み数値データ\nnum_data = num_pipeline.fit_transform(train_data)\n",[85,496,497,502,507,511,516,521,525,530,535,540,545,550,555,559,564],{"__ignoreMap":83},[88,498,499],{"class":90,"line":91},[88,500,501],{},"from sklearn.pipeline import Pipeline\n",[88,503,504],{"class":90,"line":97},[88,505,506],{},"import numpy as np\n",[88,508,509],{"class":90,"line":104},[88,510,101],{"emptyLinePlaceholder":100},[88,512,513],{"class":90,"line":110},[88,514,515],{},"# いらないと思われる列を指定\n",[88,517,518],{"class":90,"line":116},[88,519,520],{},"dropNumList = ['PassengerId', 'Survived']\n",[88,522,523],{"class":90,"line":122},[88,524,101],{"emptyLinePlaceholder":100},[88,526,527],{"class":90,"line":128},[88,528,529],{},"# 数値列に対して欠損値の補完などを行う\n",[88,531,532],{"class":90,"line":134},[88,533,534],{},"num_pipeline = Pipeline([\n",[88,536,537],{"class":90,"line":139},[88,538,539],{},"    ('selector', DataFrameExtracter(dataType='number')),\n",[88,541,542],{"class":90,"line":145},[88,543,544],{},"    ('dropAtt', DropAttributes(dropList=dropNumList)),\n",[88,546,547],{"class":90,"line":151},[88,548,549],{},"    ('fillNa', FillNa(valueType='median', 
columns=['Age'])),\n",[88,551,552],{"class":90,"line":157},[88,553,554],{},"])\n",[88,556,557],{"class":90,"line":162},[88,558,101],{"emptyLinePlaceholder":100},[88,560,561],{"class":90,"line":168},[88,562,563],{},"# 前処理済み数値データ\n",[88,565,566],{"class":90,"line":174},[88,567,568],{},"num_data = num_pipeline.fit_transform(train_data)\n",[570,571],"img",{"alt":572,"img-src":573},"output","\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_7.png",[10,575,576,577,580,581,583],{},"このように呼び出したい順番に各クラスを記述していく。\n結果をみると",[85,578,579],{},"dropNumList","に書かれた列は削除されていることがわかる。",[24,582],{},"\n欠損値についても以下のようになくなっていることがわかる。",[570,585],{"alt":572,"img-src":586},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_8.png",[10,588,589],{},"カテゴリ変数についても同様に処理していく。",[78,591,593],{"className":80,"code":592,"language":82,"meta":83,"style":83},"# いらないと思われるカテゴリ列を指定\ndropCatList = ['Name', 'Ticket', 'Cabin']\n\n# オブジェクトタイプの列に対してダミー変数化などを行う\ncat_pipeline = Pipeline([\n    ('selector', DataFrameExtracter(dataType='object')),\n    ('dropAtt', DropAttributes(dropList=dropCatList)),\n    ('dummy', DummyCat()),\n])\n\n# 前処理済みカテゴリ変数\ncat_data = cat_pipeline.fit_transform(train_data)\n",[85,594,595,600,605,609,614,619,624,629,634,638,642,647],{"__ignoreMap":83},[88,596,597],{"class":90,"line":91},[88,598,599],{},"# いらないと思われるカテゴリ列を指定\n",[88,601,602],{"class":90,"line":97},[88,603,604],{},"dropCatList = ['Name', 'Ticket', 'Cabin']\n",[88,606,607],{"class":90,"line":104},[88,608,101],{"emptyLinePlaceholder":100},[88,610,611],{"class":90,"line":110},[88,612,613],{},"# オブジェクトタイプの列に対してダミー変数化などを行う\n",[88,615,616],{"class":90,"line":116},[88,617,618],{},"cat_pipeline = Pipeline([\n",[88,620,621],{"class":90,"line":122},[88,622,623],{},"    ('selector', DataFrameExtracter(dataType='object')),\n",[88,625,626],{"class":90,"line":128},[88,627,628],{},"    ('dropAtt', DropAttributes(dropList=dropCatList)),\n",[88,630,631],{"class":90,"line":134},[88,632,633],{},"    ('dummy', 
DummyCat()),\n",[88,635,636],{"class":90,"line":139},[88,637,554],{},[88,639,640],{"class":90,"line":145},[88,641,101],{"emptyLinePlaceholder":100},[88,643,644],{"class":90,"line":151},[88,645,646],{},"# 前処理済みカテゴリ変数\n",[88,648,649],{"class":90,"line":157},[88,650,651],{},"cat_data = cat_pipeline.fit_transform(train_data)\n",[570,653],{"alt":572,"img-src":654},"\u002Fimg\u002Fkaggle_titanic\u002Ftitanic_output_9.png",[10,656,657,658,660],{},"結果からうまくカテゴリごとに数値化できたことが確認できる。",[24,659],{},"\n（男女は分けなくてもいい説）",[10,662,663],{},"ラストはこの二つを結合させる。",[78,665,667],{"className":80,"code":666,"language":82,"meta":83,"style":83},"from sklearn.pipeline import  FeatureUnion\n\n# 二つのパイプラインを結合する\nfull_pipeline = FeatureUnion(transformer_list=[(\"num_pipeline\", num_pipeline), (\"cat_pipeline\", cat_pipeline)])\ncleaned_data = full_pipeline.fit_transform(train_data)\ncolumns = list(num_data.columns) + list(cat_data.columns)\n\n# arrayと列の対応関係を保持\nmap_columns = {}\nfor idx, key in enumerate(columns):\n    map_columns[idx] = key\n",[85,668,669,674,678,683,688,693,698,702,707,712,717],{"__ignoreMap":83},[88,670,671],{"class":90,"line":91},[88,672,673],{},"from sklearn.pipeline import  FeatureUnion\n",[88,675,676],{"class":90,"line":97},[88,677,101],{"emptyLinePlaceholder":100},[88,679,680],{"class":90,"line":104},[88,681,682],{},"# 二つのパイプラインを結合する\n",[88,684,685],{"class":90,"line":110},[88,686,687],{},"full_pipeline = FeatureUnion(transformer_list=[(\"num_pipeline\", num_pipeline), (\"cat_pipeline\", cat_pipeline)])\n",[88,689,690],{"class":90,"line":116},[88,691,692],{},"cleaned_data = full_pipeline.fit_transform(train_data)\n",[88,694,695],{"class":90,"line":122},[88,696,697],{},"columns = list(num_data.columns) + list(cat_data.columns)\n",[88,699,700],{"class":90,"line":128},[88,701,101],{"emptyLinePlaceholder":100},[88,703,704],{"class":90,"line":134},[88,705,706],{},"# arrayと列の対応関係を保持\n",[88,708,709],{"class":90,"line":139},[88,710,711],{},"map_columns = 
{}\n",[88,713,714],{"class":90,"line":145},[88,715,716],{},"for idx, key in enumerate(columns):\n",[88,718,719],{"class":90,"line":151},[88,720,721],{},"    map_columns[idx] = key\n",[10,723,724,725,727,728,730],{},"これで二つのパイプラインを結合して一つにできた。",[24,726],{},"\nただ、もうちょっとまとめたい気もしている。",[24,729],{},"\n結局、二つのパイプラインをいちいち定義する工程が入るので、それもメソッド化するクラスを作っても良さそう。",[10,732,733,734,737,738,740],{},"得られた",[85,735,736],{},"cleaned_data","は numpy.ndarray の型に変換されている。",[24,739],{},"\nこれは、以降で用いる各手法の入力がベクトルを指定しているためである。",[10,742,743],{},"さて、これであとはデータをモデルにいれるだけの状態にできた！",[28,745,746],{"id":746},"まとめ",[10,748,749,750,752,753,755,756,758],{},"今回は前処理について書いた。",[24,751],{},"\n前処理は奥が深く自分が書いたような単純な事ばかりではなさそう。",[24,754],{},"\nただ、個人的には最初から深く立ち入ってもなんのためにやっているかわからなくなるので、まずは先に進むことを優先してみた。",[24,757],{},"\n実際、ぽんぽん前に進むと学習意欲が湧いてくる。",[10,760,761],{},"次は一番面白いと思われるモデル検証編をやります！",[10,763,764],{},"参考書籍・サイト等は前回と同様です！",[392,766,767,775],{},[395,768,769],{},[16,770,774],{"href":771,"rel":772},"https:\u002F\u002Famzn.to\u002F38PNOK9",[773],"nofollow","scikit-learn と TensorFlow による実践機械学習",[395,776,777],{},[16,778,781],{"href":779,"rel":780},"https:\u002F\u002Famzn.to\u002F2O246HK",[773],"Python ではじめる機械学習 ―scikit-learn で学ぶ特徴量エンジニアリングと機械学習の基礎",[783,784,785],"style",{},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: 
var(--shiki-default-text-decoration);}",{"title":83,"searchDepth":97,"depth":97,"links":787},[788,789,790,791,792,793],{"id":30,"depth":97,"text":30},{"id":60,"depth":97,"text":60},{"id":69,"depth":97,"text":70},{"id":422,"depth":97,"text":423},{"id":488,"depth":97,"text":488},{"id":746,"depth":97,"text":746},"2019-02-25",false,"md",{},"\u002Fcontents\u002Fml_titanic_part2",{"title":5,"description":83},"contents\u002Fml_titanic_part2",[802],"機械学習","\u002Fimg\u002Ftwitter-card.png","bN7P0iHi2-cqumuj_QYPcXHMa-eaz2UXKRWEaVLFqsg",[806,810],{"title":807,"path":808,"stem":809,"children":-1},"【機械学習】初心者がKaggleのtitanicで勉強してみた","\u002Fcontents\u002Fml_titanic","contents\u002Fml_titanic",{"title":811,"path":812,"stem":813,"children":-1},"【機械学習】初心者がKaggleのtitanicで勉強してみた(アルゴリズム選定編)","\u002Fcontents\u002Fml_titanic_part3","contents\u002Fml_titanic_part3",1782055100472]