transcription
EDA + Baseline Model(0.40 RMSE).ipynb
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
로 변경하여 임포트
pd.set_option('display.max_colwidth', -1)
train[pd.isnull(train)].sum()
train.isnull().sum()
로 작성하여 확인하는 방식으로 변경
pickup_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 10,)
# Original heat-map cell. Builds a HeatMap layer from
# (latitude, longitude, weight) triples: the rounded pickup coordinates plus
# the Num_Trips value of each row of `pickup`.
# NOTE(review): the accompanying note says Num_Trips is cast to float64 in the
# revised cell -- presumably the integer weights were the problem here;
# confirm against the folium version in use.
hm_wide = HeatMap( list(zip(pickup.pickup_latitude_round3.values, pickup.pickup_longitude_round3.values, pickup.Num_Trips.values)),
min_opacity=0.2,
radius=5, blur=15,
max_zoom=1
)
# Attach the layer to pickup_map; the bare `pickup_map` expression is what a
# notebook cell would render as its output.
pickup_map.add_child(hm_wide)
pickup_map
pickup['Num_Trips'] = pickup['Num_Trips'].astype('float64')
으로 float으로 변환하여 사용
pickup['Num_Trips'] = pickup['Num_Trips'].astype('float64')
# Revised heat-map cell. Creates the base map with a fixed centre
# ([40.730610, -73.935242]) and zoom level 10.
pickup_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 10,)
# Heat-map layer from (latitude, longitude, weight) triples; the columns are
# read with bracket indexing here. Num_Trips is cast to float64 immediately
# before this cell, so the weights are floats.
hm_wide = HeatMap(list(zip(pickup['pickup_latitude_round3'].values,
pickup['pickup_longitude_round3'].values,
pickup['Num_Trips'].values)),
min_opacity=0.2,
radius=5, blur=15,
max_zoom=1)
# Attach the layer and leave the map as the cell's final expression.
pickup_map.add_child(hm_wide)
pickup_map
# Original cell: the pickup_day_of_week column is passed to countplot as a
# positional argument, with bars ordered Monday through Sunday.
plt.figure(figsize=(8,5))
sns.countplot(train['pickup_day_of_week'],order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday'])
# Revised cell: the same column is passed explicitly as x=, the weekday order
# is unchanged, and the y-axis is labelled 'Count'.
plt.figure(figsize=(8, 5))
sns.countplot(x=train['pickup_day_of_week'], order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.ylabel('Count')
방위각 계산
공식
계산
atan2
# The bearing θ is computed with the atan2 function.
def calculateBearing(lat1, lng1, lat2, lng2):
    """Return the initial bearing, in degrees, from point 1 to point 2.

    Args:
        lat1: Latitude of the start point, in degrees.
        lng1: Longitude of the start point, in degrees.
        lat2: Latitude of the end point, in degrees.
        lng2: Longitude of the end point, in degrees.

    Returns:
        The bearing produced by ``np.arctan2`` converted to degrees, so it
        lies in [-180, 180]: 0 is due north, 90 due east, -90 due west.
        Inputs may be scalars or NumPy-compatible arrays, because only NumPy
        ufuncs and arithmetic are applied to them.
    """
    # The longitude difference is taken in degrees first and then converted,
    # exactly as before, so results stay numerically identical.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = (
        np.cos(lat1) * np.sin(lat2)
        - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    )
    return np.degrees(np.arctan2(y, x))
# Row by row, compute the bearing from the rounded pickup coordinates to the
# rounded dropoff coordinates with calculateBearing and store it in a new
# 'bearing' column of `train`.
train['bearing']=train.apply(lambda row:calculateBearing(row['pickup_latitude_round3'],row['pickup_longitude_round3'],row['dropoff_latitude_round3'],row['dropoff_longitude_round3']),axis=1)
# Print how many distinct values the 'pickup_neighbourhood' column holds.
print(train['pickup_neighbourhood'].nunique())
pickup_neighbourhood, dropoff_neighbourhood에 대한 kmeans 객체가 정의되지 않아 발생한 문제
test['pickup_datetime']=pd.to_datetime(test['pickup_datetime'],format='%Y-%m-%d %H:%M:%S')
# Feature engineering for the `test` frame. Every dropoff_datetime /
# trip_duration based step below is left disabled (commented out).
# NOTE(review): presumably the test data has no dropoff_datetime or
# trip_duration columns -- confirm against the dataset.
#test['dropoff_datetime']=pd.to_datetime(test['dropoff_datetime'],format='%Y-%m-%d %H:%M:%S')
# Calendar features derived from pickup_datetime: date, day of month, hour,
# and weekday name (looked up in calendar.day_name).
test['pickup_date']= test['pickup_datetime'].dt.date
test['pickup_day']=test['pickup_datetime'].apply(lambda x:x.day)
test['pickup_hour']=test['pickup_datetime'].apply(lambda x:x.hour)
test['pickup_day_of_week']=test['pickup_datetime'].apply(lambda x:calendar.day_name[x.weekday()])
#test['dropoff_date']= test['dropoff_datetime'].dt.date
#test['dropoff_day']=test['dropoff_datetime'].apply(lambda x:x.day)
#test['dropoff_hour']=test['dropoff_datetime'].apply(lambda x:x.hour)
#test['dropoff_day_of_week']=test['dropoff_datetime'].apply(lambda x:calendar.day_name[x.weekday()])
# Pickup and dropoff coordinates rounded to 3 decimal places.
test['pickup_latitude_round3']=test['pickup_latitude'].apply(lambda x:round(x,3))
test['pickup_longitude_round3']=test['pickup_longitude'].apply(lambda x:round(x,3))
test['dropoff_latitude_round3']=test['dropoff_latitude'].apply(lambda x:round(x,3))
test['dropoff_longitude_round3']=test['dropoff_longitude'].apply(lambda x:round(x,3))
# Row-wise trip distance; calculateDistance is defined outside this excerpt.
test['trip_distance']=test.apply(lambda row:calculateDistance(row),axis=1)
#test['trip_duration_in_hour']=test['trip_duration'].apply(lambda x:x/3600)
# Row-wise bearing from the rounded pickup point to the rounded dropoff point.
test['bearing']=test.apply(lambda row:calculateBearing(row['pickup_latitude_round3'],row['pickup_longitude_round3'],row['dropoff_latitude_round3'],row['dropoff_longitude_round3']),axis=1)
# Neighbourhood labels from kmeans.predict on the unrounded coordinates.
# NOTE(review): `kmeans` is not defined anywhere in this excerpt, and the
# accompanying note reports that the failure came from the kmeans object being
# undefined -- it has to be created and fitted before this cell runs.
test.loc[:, 'pickup_neighbourhood'] = kmeans.predict(test[['pickup_latitude', 'pickup_longitude']])
test.loc[:, 'dropoff_neighbourhood'] = kmeans.predict(test[['dropoff_latitude', 'dropoff_longitude']])