2.1.12.2. Logistic regression with Python
Last updated
Last updated
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE: `%matplotlib inline` is an IPython/Jupyter magic, not Python syntax.
# It is kept commented out so this file parses as plain Python; run it in a
# notebook cell if inline figures are needed.
# %matplotlib inline

# NOTE(review): the columns used below ('Survived', 'Sex', 'Embarked',
# 'Pclass', 'Age', 'SibSp', 'Fare') do not look like a housing dataset --
# confirm that this is the intended CSV file.
train = pd.read_csv('USA_Housing.csv')
train.info()
train.head(10)

# Heat map of the null mask: shows which columns contain missing values.
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')

sns.set_style('whitegrid')
sns.countplot(x='Survived', data=train)

# Survival counts broken down by sex, port of embarkation and passenger class.
sns.countplot(x='Survived', hue='Sex', data=train, palette='RdBu_r')
sns.countplot(x='Survived', hue='Embarked', data=train, palette='RdBu_r')
sns.countplot(x='Survived', hue='Pclass', data=train, palette='RdBu_r')

# NOTE: `sns.distplot` is deprecated in recent seaborn releases in favour of
# `histplot` / `displot`; left unchanged here to keep the same output.
sns.distplot(train['Age'].dropna(), kde=False, bins=30)

sns.countplot(x='SibSp', data=train)

train['Fare'].plot.hist(bins=40, figsize=(10, 4))

# Interactive version of the fare histogram via cufflinks (adds `.iplot`).
import cufflinks as cf
cf.go_offline()
train['Fare'].iplot(bins=40)

plt.figure(figsize=(10, 7))
sns.boxplot(x='Pclass', y='Age', data=train)

# Replace null values with stand-in data
def impute_age(cols):
    """Return the age for one row, filling a missing age based on Pclass.

    Args:
        cols: Two values in the order (Age, Pclass) -- for example one row of
            ``train[['Age', 'Pclass']]`` as passed by
            ``DataFrame.apply(..., axis=1)``, or a plain 2-item sequence.

    Returns:
        The original age when it is not null; otherwise 37 for Pclass 1,
        29 for Pclass 2, and 24 for any other Pclass value.
    """
    # Unpack by position rather than `cols[0]` / `cols[1]`: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas,
    # while iteration always yields the values in order.
    age, pclass = cols

    if not pd.isnull(age):
        return age

    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24
# Fill every missing 'Age' using the row's 'Pclass' (see impute_age).
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)

# Redraw the heat map
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')

train.drop('Cabin', axis=1, inplace=True)

# Some NA values still remain after the drop
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')

# Drop the remaining rows that contain any null, then check once more.
train.dropna(inplace=True)
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')

# Sex
# One-hot encoding; drop_first=True drops the first dummy column.
sex = pd.get_dummies(train['Sex'], drop_first=True)
sex.head()

# Port of embarkation
embark = pd.get_dummies(train['Embarked'], drop_first=True)
embark.head()

# Append the dummy columns to the original frame.
train = pd.concat([train, sex, embark], axis=1)
train.head()

# Remove the source columns of the dummies together with 'Name', 'Ticket'
# and 'PassengerId'.
train.drop(['Sex', 'Embarked', 'Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)
train.head()

# Features are every remaining column except the target 'Survived'.
X = train.drop('Survived', axis=1)
Y = train['Survived']
# `sklearn.cross_validation` was removed from scikit-learn; the same
# `train_test_split` function lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Hold out 40% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=101)

logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
predictions = logmodel.predict(X_test)

print(classification_report(y_test, predictions))

# Bare expression: displayed as cell output in a notebook only.
confusion_matrix(y_test, predictions)