📌 sklearn을 사용하여 데이터 학습시켜보기
# Context (translated from the original post): this script belongs to a Django
# project that applies machine learning. On the screen described in the post
# the user uploads a face photo and enters the passenger details below; a
# separate image-recognition model estimates sex and age from the photo, and
# the entered details are used to predict whether the passenger survives.
import warnings
from pdb import post_mortem

import matplotlib.pyplot as plt
import pandas as pd
import torch
from IPython.display import display
from sklearn import metrics  # accuracy measure
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  # training and testing data split

plt.style.use('fivethirtyeight')
# NOTE: this silences every warning, including pandas/scikit-learn deprecation
# and convergence warnings. Disable it while debugging.
warnings.filterwarnings('ignore')

# Rare or foreign titles folded into the five groups used below.
TITLE_ALIASES = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}
# Age assigned to passengers whose Age is missing, keyed by title group
# (presumably rounded from the per-title mean age of the training set).
AGE_BY_TITLE = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}

# Load the data.
train_data = pd.read_csv('/titanic/train.csv')
test_data = pd.read_csv('/titanic/test.csv')

# Share of missing values per column. The message is only built, not printed;
# add print(msg) inside the loop to inspect it.
for frame in (train_data, test_data):
    for col in frame.columns:
        msg = 'νλͺ© {:>10}\t λΉμ΄μλ μλ£μ λΉμ¨ : {:.2f}%'.format(col, 100 * (frame[col].isnull().sum() / frame[col].shape[0]))

for frame in (train_data, test_data):
    # The title is the word directly before the first '.' in the name.
    titles = frame['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Assign results back instead of calling inplace=True on a selected
    # column: under pandas Copy-on-Write the chained in-place form does not
    # update the parent frame.
    frame['Initial'] = titles.replace(TITLE_ALIASES)
    # Fill missing ages from the passenger's title group; rows whose title is
    # not in AGE_BY_TITLE keep a missing Age.
    frame['Age'] = frame['Age'].fillna(frame['Initial'].map(AGE_BY_TITLE))

train_data['Embarked'] = train_data['Embarked'].fillna('S')

# Age bands: <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4.
# Rows with a missing Age stay in band 0.
train_data['Age_band'] = 0
for band, lower_bound in enumerate((16, 32, 48, 64), start=1):
    train_data.loc[train_data['Age'] > lower_bound, 'Age_band'] = band

# Family size excludes the passenger; Alone flags passengers with no relatives.
train_data['Family_Size'] = train_data['Parch'] + train_data['SibSp']
train_data['Alone'] = (train_data['Family_Size'] == 0).astype(int)

# Encode the categorical features as integers.
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
train_data['Embarked'] = train_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# 'Initial' is dropped too: it is only needed for the age imputation above.
train_data = train_data.drop(
    columns=['Name', 'Age', 'Ticket', 'Cabin', 'PassengerId', 'SibSp', 'Parch', 'Initial']
)

train, test = train_test_split(
    train_data, test_size=0.3, random_state=0, stratify=train_data['Survived']
)
# Select the target by name rather than by column position, and keep it 1-D
# as scikit-learn estimators expect.
train_X = train.drop(columns='Survived')
train_Y = train['Survived']
test_X = test.drop(columns='Survived')
test_Y = test['Survived']
X = train_data.drop(columns='Survived')
Y = train_data['Survived']

model = LogisticRegression()
model.fit(train_X, train_Y)
prediction3 = model.predict(test_X)

# One hand-written passenger, in the same column order as train_X
# (Pclass, Sex, Fare, Embarked, Age_band, Family_Size, Alone).
test_x = [[1, 0, 10.0000, 1, 1, 1, 1]]
print('The accuracy of the Logistic Regression is', metrics.accuracy_score(test_Y, prediction3))
# Attach the training column names so the sample is matched to the fitted features.
test_test = model.predict(pd.DataFrame(test_x, columns=train_X.columns))
print(test_test)
https://welcome-to-dewy-world.tistory.com/4?category=913368
위 코드를 참조하여 작성하였다. 모든 줄의 실행 결과와 해당 코드를 왜 사용하였는지는 직관적으로 알 수 있다. 개인적으로 이미지 처리하는 코드는 2차원 배열, 3차원까지 다루며 코드가 직관적이지 않아 이해하기 너무 힘들었지만 이미지 처리가 아닌 데이터 학습을 시키는 것은 간단한 방법이면 나름 이해하기가 쉽다.
μΌκΈ μΆμ²μ 10000κ°μ νμ΅λ²μ μ£Όλ§μ μκ°μ΄ λλ©΄ ν΄λ΄μΌκ² λ€.
위 코드와 정확도가 4퍼센트정도 차이가 나며 프로젝트를 완성하고 나면 코드를 하나하나 다시 뜯어봐야겠다. 개인적으로 처음에 C++을 공부하듯 엄청 막막했는데 데이터 학습을 시키며 결과가 나오는게 너무 신기하다.
test_x의 데이터는 프로젝트에서 사용할 데이터 전달 방식이다. 해당 배열을 입력받아 그대로 넣어줄 것이다.
편의상 학습에 영향을 많이 주지만 재미로 해보는 테스트이기에 항구(탑승지)는 한국 항구로 임의로 정하였다.
📌 문제가 안 풀린다 ㅠㅠㅠ 시간도 넉넉치 않아 못했다 ㅠㅠ 일주일 할당치를 못 한 것 같다... 금요일 오전까지 프로젝트가 진행되니 금요일과 주말에 많이 풀어야겠다...