# Load the raw dataset into a DataFrame.
# NOTE(review): assumes pandas was imported as `pd` before this point; the
# import is not visible in this section — confirm.
df = pd.read_csv('data_v1.csv')
# The bare expressions below only produce visible output when run
# interactively (one notebook cell each); in a plain script they are no-ops.
# Whole frame, then the first and last rows.
df
df.head()
df.tail()
# Column dtypes and non-null counts.
df.info()
# Structural pieces of the frame: row index, column labels, underlying values.
df.index
df.columns
df.values
# Number of missing values per column.
df.isnull().sum()
# Summary statistics.
df.describe()
# <Data Preprocessing>
# Preprocessing: drop the ID column, convert TotalCharges to float, and
# encode Churn as 1/0.
df.info()
df.drop('customerID', axis=1, inplace=True)
df.info()
df['TotalCharges']
# The original notes marked a direct conversion as WRONG — presumably because
# the column still holds blank strings that cannot be parsed as floats.
# Kept here for reference only, not executed:
# df['TotalCharges'].astype(float)

# Locate the rows whose TotalCharges is an empty or single-space string.
(df['TotalCharges'] == '') | (df['TotalCharges'] == ' ')
cond = (df['TotalCharges'] == '') | (df['TotalCharges'] == ' ')
df[cond]
# Assign the result back rather than calling replace(inplace=True) on the
# selected column: the chained in-place form does not write through to `df`
# when pandas Copy-on-Write is active.
df['TotalCharges'] = df['TotalCharges'].replace([' '], ['0'])
df['TotalCharges'] = df['TotalCharges'].astype(float)
# Re-run the blank check after conversion; it is expected to select no rows.
cond = (df['TotalCharges'] == '') | (df['TotalCharges'] == ' ')
df[cond]
df.info()
# Encode the target: 'Yes' -> 1, 'No' -> 0 (counts shown before and after).
df['Churn'].value_counts()
df['Churn'] = df['Churn'].replace(['Yes', 'No'], [1, 0])
df['Churn'].value_counts()
# Check for the existence of null data
# Per-column null counts before cleaning.
df.isna().sum()
# Remove the DeviceProtection column, then every row that still contains a null.
df.drop(columns='DeviceProtection', inplace=True)
df.dropna(axis=0, how='any', inplace=True)
# Null counts again after cleaning, followed by the resulting schema.
df.isna().sum()
df.info()
# <Visualization>
import matplotlib.pyplot as plt
%matplotlib inline
# Value counts of the gender column, shown as a table and as a bar chart.
gender_counts = df['gender'].value_counts()
gender_counts
gender_counts.plot.bar()
# Same bar chart for the Partner column.
df['Partner'].value_counts().plot.bar()
# Object-dtype columns: first three rows, then the column names as an array.
object_cols = df.select_dtypes(include='O')
object_cols.head(3)
object_cols.columns.values
# Draw one value-count bar chart per object-dtype column, titled with the
# column name. The loop body is indented here; in the flat form the three
# statements were not inside the loop and the file did not parse.
object_list = df.select_dtypes('object').columns.values
for col in object_list:
    df[col].value_counts().plot(kind='bar')
    plt.title(col)
    # Render each chart separately so the columns do not share one axes.
    plt.show()
# Remove the PhoneService column from the frame.
df.drop(columns='PhoneService', inplace=True)
# First three rows of the numeric columns.
df.select_dtypes(include='number').head(3)
# Churn value counts: table, then bar chart.
churn_counts = df['Churn'].value_counts()
churn_counts
churn_counts.plot.bar()
# SeniorCitizen value counts: table, then bar chart.
senior_counts = df['SeniorCitizen'].value_counts()
senior_counts
senior_counts.plot.bar()
# Remove the SeniorCitizen column and review the remaining schema.
df.drop(columns='SeniorCitizen', inplace=True)
df.info()
# NOTE(review): `sns` is not imported anywhere in this section; presumably
# `import seaborn as sns` happens earlier — confirm, otherwise these calls
# raise NameError.
# Histogram of tenure, overall and then split by Churn.
sns.histplot(data=df, x='tenure')
sns.histplot(data=df, x='tenure', hue='Churn')
# Kernel density estimate of tenure, split by Churn.
sns.kdeplot(data=df, x='tenure', hue='Churn')
# Histogram of TotalCharges, then its density split by Churn.
sns.histplot(data=df, x='TotalCharges')
sns.kdeplot(data=df, x='TotalCharges', hue='Churn')
# Count of each MultipleLines category, split by Churn.
sns.countplot(data=df, x='MultipleLines', hue='Churn')
# Pairwise correlation of the three listed columns: as a table, then as an
# annotated heatmap (the correlation matrix is computed once per line).
df[['tenure', 'MonthlyCharges', 'TotalCharges']].corr()
sns.heatmap(df[['tenure', 'MonthlyCharges', 'TotalCharges']].corr(), annot=True)
# Boxplot
# Box plot of TotalCharges for each Churn value.
# NOTE(review): relies on `sns` being imported earlier (not visible here) — confirm.
sns.boxplot(data=df, x='Churn', y='TotalCharges')
# Write the processed frame to disk without the row index column.
df.to_csv('data_v1_save.csv', index=False)
# Read the saved file back and show its first rows.
pd.read_csv('data_v1_save.csv').head()