이전 칼럼 '투수의 디셉션을 정량적 평가하기(1)'에서는 Jon Anderson이 제안한 방식으로 투수의 패스트볼 디셉션을 정량화하는 방법을 살펴보았습니다. 이번 칼럼에서는 2023년 MLB 데이터를 사용하여 투수들의 디셉션을 더 깊이 분석해 보겠습니다. 이 분석은 Python과 여러 데이터 분석 라이브러리를 활용하여 투수들의 피칭 스타일과 디셉션을 정량적으로 평가하는 데 중점을 둡니다.
import pandas as pd
import os
# Folder that holds the source CSV files
folder_path = '/Users/kang/Desktop/baseball'
# Name of the merged file written at the end of this cell
combined_name = 'combined_data.csv'
# List every CSV in the folder, in a deterministic order. The merged output is
# skipped: if this cell is re-run with the working directory set to
# folder_path, a previous combined_data.csv would otherwise be read back in
# and every pitch would be duplicated.
csv_files = sorted(
    file for file in os.listdir(folder_path)
    if file.endswith('.csv') and file != combined_name
)
if not csv_files:
    # pd.concat([]) fails with an opaque message; say what is actually wrong
    raise FileNotFoundError('No CSV files found in {}'.format(folder_path))
# Load each file into its own DataFrame
dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]
# Stack all DataFrames into one with a fresh 0..n-1 index
all_data = pd.concat(dataframes, ignore_index=True)
# Save the merged result as a new CSV file (relative to the working directory)
all_data.to_csv(combined_name, index=False)
import pandas as pd
# Reload the merged CSV and preview its first rows
combined_csv_path = 'combined_data.csv'
all_data = pd.read_csv(combined_csv_path)
all_data.head()
import pandas as pd, numpy as np, os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sklearn
from sklearn.cluster import KMeans
from sklearn import preprocessing
warnings.filterwarnings('ignore')
# Plot color for each pitch name, declared color-first so pitches that share a
# color sit together.
pitch_color_dict = {
    pitch: color
    for color, pitches in (
        ('indianred', ('Sinker', '2-Seam Fastball', 'Cutter', 'Forkball', 'Split Finger')),
        ('red', ('4-Seam Fastball',)),
        ('darkorange', ('Slider',)),
        ('blue', ('Curveball', 'Knuckle Curve')),
        ('darkgray', ('Changeup',)),
        ('cyan', ('Eephus', 'Knuckle Ball')),
    )
    for pitch in pitches
}
# Lookup from an `events` value to 1 when it is flagged as a plate appearance,
# else 0. Declared as "flagged" / "not flagged" groups instead of key by key.
pa_flag_dict = {
    **dict.fromkeys((
        'field_out', 'strikeout', 'double', 'strikeout_double_play', 'single',
        'force_out', 'hit_by_pitch', 'grounded_into_double_play', 'home_run',
        'walk', 'sac_bunt', 'triple', 'sac_fly', 'field_error', 'double_play',
        'fielders_choice_out', 'fielders_choice', 'sac_fly_double_play',
        'triple_play', 'batter_interference', 'sac_bunt_double_play',
    ), 1),
    **dict.fromkeys((
        'nan', 'caught_stealing_2b', 'catcher_interf', 'pickoff_1b', 'other_out',
        'caught_stealing_home', 'pickoff_caught_stealing_2b', 'caught_stealing_3b',
        'pickoff_caught_stealing_home', 'pickoff_2b', 'run', 'pickoff_3b',
        'pickoff_caught_stealing_3b',
    ), 0),
}
# Lookup from an `events` value to 1 when it is flagged as an at-bat, else 0.
# Same key set as pa_flag_dict; walks, hit-by-pitch, sac bunts and sac flies
# are flagged 0 here.
ab_flag_dict = {
    **dict.fromkeys((
        'field_out', 'strikeout', 'double', 'strikeout_double_play', 'single',
        'force_out', 'grounded_into_double_play', 'home_run', 'triple',
        'field_error', 'double_play', 'fielders_choice_out', 'fielders_choice',
        'sac_fly_double_play', 'triple_play', 'batter_interference',
        'sac_bunt_double_play',
    ), 1),
    **dict.fromkeys((
        'nan', 'hit_by_pitch', 'walk', 'caught_stealing_2b', 'sac_bunt', 'sac_fly',
        'catcher_interf', 'pickoff_1b', 'other_out', 'caught_stealing_home',
        'pickoff_caught_stealing_2b', 'caught_stealing_3b',
        'pickoff_caught_stealing_home', 'pickoff_2b', 'run', 'pickoff_3b',
        'pickoff_caught_stealing_3b',
    ), 0),
}
# Lookup from an `events` value to 1 when the event is a hit (single, double,
# triple, home run), else 0.
# Fix: the original spelled one key 'pickoff_1,b'; it is now 'pickoff_1b',
# matching the key used by the other event lookup tables in this file.
is_hit_dict = {
    **dict.fromkeys(('single', 'double', 'triple', 'home_run'), 1),
    **dict.fromkeys((
        'field_out', 'nan', 'strikeout', 'strikeout_double_play', 'force_out',
        'hit_by_pitch', 'grounded_into_double_play', 'walk', 'caught_stealing_2b',
        'sac_bunt', 'sac_fly', 'field_error', 'double_play', 'catcher_interf',
        'fielders_choice_out', 'fielders_choice', 'pickoff_1b', 'other_out',
        'caught_stealing_home', 'pickoff_caught_stealing_2b', 'caught_stealing_3b',
        'sac_fly_double_play', 'pickoff_caught_stealing_home', 'pickoff_2b', 'run',
        'triple_play', 'batter_interference', 'pickoff_3b', 'sac_bunt_double_play',
        'pickoff_caught_stealing_3b',
    ), 0),
}
# Lookup from a pitch `description` to 1 when the batter offered at the pitch
# (swings and bunt attempts), else 0.
swing_dict = {
    **dict.fromkeys((
        'foul_tip', 'swinging_strike', 'bunt_foul_tip', 'foul',
        'hit_into_play_no_out', 'hit_into_play', 'hit_into_play_score',
        'missed_bunt', 'swinging_strike_blocked', 'foul_bunt',
    ), 1),
    **dict.fromkeys((
        'ball', 'called_strike', 'pitchout', 'hit_by_pitch', 'blocked_ball',
    ), 0),
}
# Lookup from a pitch `description` to 1 when the bat touched the ball, else 0.
contact_dict = {
    **dict.fromkeys((
        'foul_tip', 'foul', 'hit_into_play_no_out', 'hit_into_play',
        'hit_into_play_score', 'bunt_foul_tip', 'foul_bunt',
    ), 1),
    **dict.fromkeys((
        'ball', 'called_strike', 'swinging_strike', 'pitchout', 'missed_bunt',
        'hit_by_pitch', 'blocked_ball', 'swinging_strike_blocked',
    ), 0),
}
# Lookup from a pitch `description` to 1 when the ball was put in play, else 0.
inplay_dict = {
    **dict.fromkeys((
        'hit_into_play_no_out', 'hit_into_play', 'hit_into_play_score',
    ), 1),
    **dict.fromkeys((
        'ball', 'foul_tip', 'called_strike', 'swinging_strike', 'pitchout',
        'bunt_foul_tip', 'foul', 'missed_bunt', 'hit_by_pitch', 'blocked_ball',
        'swinging_strike_blocked', 'foul_bunt',
    ), 0),
}
# Parse the game date and put the pitches in chronological order
all_data['game_date'] = pd.to_datetime(all_data['game_date'])
all_data = all_data.sort_values('game_date')
# Launch angle rounded to zero decimals
all_data['Rounded_Launch_Angle'] = all_data['launch_angle'].round(0)
# Derive the 0/1 indicator columns by looking up `events` / `description`
# in the tables defined above; values missing from a table map to NaN.
flag_columns = (
    ('PA_flag', 'events', pa_flag_dict),
    ('AB_flag', 'events', ab_flag_dict),
    ('Is_Hit', 'events', is_hit_dict),
    ('SwungOn', 'description', swing_dict),
    ('ContactMade', 'description', contact_dict),
    ('BallInPlay', 'description', inplay_dict),
)
for flag_column, source_column, lookup in flag_columns:
    all_data[flag_column] = all_data[source_column].map(lookup)
# Only Is_Hit has its unmapped (NaN) entries replaced by 0
all_data['Is_Hit'] = all_data['Is_Hit'].fillna(0)
# Discard rows that have no pitch name
all_data = all_data.dropna(subset=['pitch_name'])
# Zone distribution of pitches thrown on a 0-0 count
is_first_pitch = (all_data['balls'] == 0) & (all_data['strikes'] == 0)
firstpitches = all_data[is_first_pitch]
firstpitches['zone'].value_counts()
def cswRate(df, min_pitches=99):
    """Return the called-strike-plus-whiff (CSW) rate of the pitches in *df*.

    Counts rows whose ``description`` is ``called_strike``,
    ``swinging_strike``, ``swinging_strike_blocked`` or ``missed_bunt`` and
    divides by the total number of rows.

    Args:
        df: DataFrame with one row per pitch and a ``description`` column.
        min_pitches: Minimum number of rows required; smaller samples are
            reported as 0. Defaults to 99, the threshold the original
            hard-coded.

    Returns:
        The CSW rate as a float in [0, 1], or 0 when *df* is empty or has
        fewer than *min_pitches* rows.
    """
    total_pitches = len(df)
    # Check the sample size BEFORE dividing: the original divided first, so an
    # empty frame raised ZeroDivisionError instead of returning 0.
    if total_pitches == 0 or total_pitches < min_pitches:
        return 0
    csw_descriptions = (
        'called_strike',
        'swinging_strike',
        'swinging_strike_blocked',
        'missed_bunt',
    )
    # int() keeps the result a plain Python float, as before
    csw_pitches = int(df['description'].isin(csw_descriptions).sum())
    return csw_pitches / total_pitches
def swingRate(df):
    """Return the swing rate of *df* as a percentage rounded to 2 decimals.

    The rate is the sum of the ``SwungOn`` column divided by the number of
    rows in *df*, multiplied by 100.
    """
    swung_fraction = df['SwungOn'].sum() / len(df)
    return round(swung_fraction * 100, 2)
def _pitches_at_count(balls, strikes):
    """Return the rows of all_data thrown on the given balls-strikes count."""
    return all_data[(all_data['strikes'] == strikes) & (all_data['balls'] == balls)]


# One subset per count; variable names read balls-then-strikes
# (e.g. one_two is 1 ball, 2 strikes).
firstpitch = _pitches_at_count(0, 0)
one_zero = _pitches_at_count(1, 0)
one_one = _pitches_at_count(1, 1)
one_two = _pitches_at_count(1, 2)
two_zero = _pitches_at_count(2, 0)
swingRate(two_zero)
three_zero = _pitches_at_count(3, 0)
swingRate(three_zero)
three_one = _pitches_at_count(3, 1)
swingRate(three_one)
three_two = _pitches_at_count(3, 2)
swingRate(three_two)
two_one = _pitches_at_count(2, 1)
two_two = _pitches_at_count(2, 2)
zero_one = _pitches_at_count(0, 1)
zero_two = _pitches_at_count(0, 2)
# (label, pitch subset) pairs for which the swing rate is reported
conditions = [
    ('All pitches', all_data),
    ('0-0 count', firstpitch),
    ('1-0 count', one_zero),
    ('1-1 count', one_one),
    ('1-2 count', one_two),
    ('2-0 count', two_zero),
    ('3-0 count', three_zero),
    ('3-1 count', three_one),
    ('3-2 count', three_two),
    ('2-1 count', two_one),
    ('2-2 count', two_two),
    ('0-1 count', zero_one),
    ('0-2 count', zero_two),
]
# Build the table in a single step. The original grew an empty DataFrame with
# pd.concat on every iteration, which copies the table each time and
# concatenates onto an empty, object-typed frame.
result_df = pd.DataFrame(
    [(label, swingRate(subset)) for label, subset in conditions],
    columns=['Count', 'Swing Rate'],
)
# Show the counts from lowest to highest swing rate
result_df.sort_values(by='Swing Rate', ascending=True)
Count | Swing Rate | |
---|---|---|
6 | 3-0 count | 9.71 |
1 | 0-0 count | 30.99 |
5 | 2-0 count | 42.57 |
2 | 1-0 count | 42.58 |
0 | All pitches | 47.49 |
11 | 0-1 count | 48.78 |
12 | 0-2 count | 51.38 |
7 | 3-1 count | 53.59 |
3 | 1-1 count | 54.19 |
4 | 1-2 count | 57.47 |
9 | 2-1 count | 57.56 |
10 | 2-2 count | 64.02 |
8 | 3-2 count | 70.05 |
대부분의 타자들은 3-0 카운트(3볼 0스트라이크)에서 스윙을 잘 하지 않으므로, 이 카운트의 투구는 분석 대상에서 제외합니다.
# Drop only pitches thrown on a 3-0 count (3 balls, 0 strikes).
# The original condition, (strikes != 0) & (balls != 3), removed every
# 0-strike pitch and every 3-ball pitch, which is far more than the 3-0 count.
# NOTE(review): results produced with the old filter will change after this fix.
is_three_zero = (all_data['balls'] == 3) & (all_data['strikes'] == 0)
all_data = all_data[~is_three_zero]
# Discard rows that have no pitch name
all_data = all_data.dropna(subset=['pitch_name'])
# Print each distinct pitch name on its own line
all_pitch_names = list(all_data['pitch_name'].unique())
for pitch_name in all_pitch_names:
    print(pitch_name)
# Share of each pitch type, most frequent first; computed once and reused for
# both the printed table and the bar chart
pitch_frequency = (
    all_data['pitch_name']
    .value_counts(normalize=True)
    .rename_axis('Pitch')
    .reset_index(name='Frequency')
    .sort_values(by='Frequency', ascending=False)
)
print(pitch_frequency)
print('\n')
pitch_frequency.plot.bar(x="Pitch", y="Frequency")
# Every pitch labelled as a four-seam fastball
all_fourseamers = all_data[all_data['pitch_name']=='4-Seam Fastball']
# Side-by-side histograms: release speed (left) and spin rate (right)
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(11,3))
histogram_panels = (
    (ax1, 'release_speed', 'All four seamers thrown, velocity histogram', (85, 103)),
    (ax2, 'release_spin_rate', 'All four seamers thrown, spin rate histogram', (1700, 2800)),
)
for axis, column, title, x_limits in histogram_panels:
    axis.hist(all_fourseamers[column], bins=50)
    axis.set_xlabel(column)
    axis.set_ylabel('Frequency')
    axis.set_title(title)
    axis.set_xlim(*x_limits)
# Mean release speed (2 decimals) and mean spin rate (rounded to an integer)
mean_velocity = round(np.mean(all_fourseamers['release_speed']),2)
mean_spin_rate = round(np.mean(all_fourseamers['release_spin_rate']))
print('Average velocity: {}'.format(mean_velocity))
print('Average spin rate: {}'.format(mean_spin_rate))
# Four-seamers faster than 85 (release_speed), keeping pitcher and outcome
# next to the two clustering features; rows with any missing value among
# these four columns are dropped.
feature_columns = ['release_speed', 'release_spin_rate']
is_fast_enough = all_fourseamers['release_speed'] > 85
fb_data_w_pitcher = all_fourseamers.loc[
    is_fast_enough, ['player_name', 'description'] + feature_columns
].dropna()
fb_data = fb_data_w_pitcher[feature_columns]
# Rescale both features with MinMaxScaler (default range) before clustering
x = fb_data.to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Keep the original row labels so scaled rows can be matched back to pitches
fb_df_normalized = pd.DataFrame(x_scaled, index=fb_data.index)
clustering_data = fb_df_normalized.values
fb_data_w_pitcher.head(5)
player_name | description | release_speed | release_spin_rate | |
---|---|---|---|---|
2289631 | De Los Santos, Enyel | hit_into_play | 92.8 | 2025.0 |
2289638 | Stephan, Trevor | ball | 92.8 | 2428.0 |
2289639 | Karinchak, James | foul | 95.7 | 2243.0 |
2289643 | De Los Santos, Enyel | foul | 93.4 | 2311.0 |
2289644 | Karinchak, James | ball | 95.8 | 2200.0 |
# Preview the first five rows of the scaled features
# (column 0 = release_speed, column 1 = release_spin_rate)
fb_df_normalized.iloc[:5]
0 | 1 | |
---|---|---|
2289631 | 0.390863 | 0.491601 |
2289638 | 0.390863 | 0.690711 |
2289639 | 0.538071 | 0.599308 |
2289643 | 0.421320 | 0.632905 |
2289644 | 0.543147 | 0.578063 |
# Within-cluster sum of squares (KMeans inertia) for k = 1..10
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=10)
    kmeans.fit(clustering_data)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Plot to Find Optimal Number of Clusters')
# Fix: the original wrote `plt.show` without parentheses, which only
# references the function and never calls it.
plt.show()
# Fit the final 6-cluster model and label every pitch
kmeans = KMeans(n_clusters=6, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(clustering_data)
# Scatter of the scaled pitches with the cluster centers overlaid in red
cluster_centers = kmeans.cluster_centers_
plt.scatter(clustering_data[:,0], clustering_data[:,1])
plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], s=300, c='red')
plt.show()
fb_data_w_pitcher['Cluster'] = pred_y
# cluster1..cluster6 hold KMeans labels 0..5, each sorted fastest pitch first
cluster1, cluster2, cluster3, cluster4, cluster5, cluster6 = (
    fb_data_w_pitcher[fb_data_w_pitcher['Cluster'] == cluster_label]
    .sort_values(by='release_speed', ascending=False)
    for cluster_label in range(6)
)
# Reference plot for all four-seamers: release speed (left), spin rate (right)
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(11,3))
reference_panels = (
    (ax1, 'release_speed', 'All four seamers thrown, velocity histogram', (85, 103)),
    (ax2, 'release_spin_rate', 'All four seamers thrown, spin rate histogram', (1700, 2800)),
)
for axis, column, title, x_limits in reference_panels:
    axis.hist(all_fourseamers[column], bins=50)
    axis.set_xlabel(column)
    axis.set_ylabel('Frequency')
    axis.set_title(title)
    axis.set_xlim(*x_limits)
# Mean release speed (2 decimals) and mean spin rate (rounded to an integer)
overall_velocity = round(np.mean(all_fourseamers['release_speed']),2)
overall_spin_rate = round(np.mean(all_fourseamers['release_spin_rate']))
print('Average velocity: {}'.format(overall_velocity))
print('Average spin rate: {}'.format(overall_spin_rate))
# For each cluster in turn: release-speed and spin-rate histograms (25 bins,
# same x-ranges as the all-pitches plot), followed by the cluster's mean
# release speed and mean spin rate.
for cluster_no, cluster_df in enumerate(
        (cluster1, cluster2, cluster3, cluster4, cluster5, cluster6), start=1):
    fig, (ax1, ax2) = plt.subplots(1,2, figsize=(11,3))
    cluster_panels = (
        (ax1, 'release_speed', 'velocity', (85, 103)),
        (ax2, 'release_spin_rate', 'spin rate', (1700, 2800)),
    )
    for axis, column, quantity, x_limits in cluster_panels:
        axis.hist(cluster_df[column], bins=25)
        axis.set_xlabel(column)
        axis.set_ylabel('Frequency')
        axis.set_title('Cluster {} four seamers thrown, {} histogram'.format(cluster_no, quantity))
        axis.set_xlim(*x_limits)
    print('Average velocity: {}'.format(round(np.mean(cluster_df['release_speed']),2)))
    print('Average spin rate: {}'.format(round(np.mean(cluster_df['release_spin_rate']))))
# For each cluster, print the three pitchers who account for the largest
# share of the cluster's pitches.
for cluster_no, cluster_df in enumerate(
        (cluster1, cluster2, cluster3, cluster4, cluster5, cluster6), start=1):
    pitcher_share = (
        cluster_df['player_name']
        .value_counts(normalize=True)
        .rename_axis('Pitcher')
        .reset_index(name='Frequency')
        .sort_values(by='Frequency', ascending=False)
    )
    print('Cluster {} Top 3 Pitchers:'.format(cluster_no))
    print(pitcher_share.head(3))
    print('\n')
def _cluster_csw_table(cluster_df, cluster_no):
    """Return one row per pitcher in *cluster_df*: pitch count, CSW, cluster number.

    Rows are built in a list and turned into a DataFrame once. The original
    grew the table with pd.concat per pitcher, which re-copies the table on
    every iteration, leaves every row with index 0 and degrades the columns
    to object dtype. Pitchers appear in order of first appearance in
    *cluster_df*, as before.
    """
    rows = [
        {
            'Pitcher': pitcher,
            'PitchesThrown': len(pitcher_df),
            'CSW': cswRate(pitcher_df),
        }
        for pitcher, pitcher_df in cluster_df.groupby('player_name', sort=False)
    ]
    table = pd.DataFrame(rows, columns=['Pitcher', 'PitchesThrown', 'CSW'])
    table['Cluster'] = cluster_no
    return table


# Distinct pitchers in each cluster
clus1pitchers = cluster1['player_name'].unique()
clus2pitchers = cluster2['player_name'].unique()
clus3pitchers = cluster3['player_name'].unique()
clus4pitchers = cluster4['player_name'].unique()
clus5pitchers = cluster5['player_name'].unique()
clus6pitchers = cluster6['player_name'].unique()
# Per-cluster pitcher tables (cluster numbers are 1-based)
clus1df = _cluster_csw_table(cluster1, 1)
clus2df = _cluster_csw_table(cluster2, 2)
clus3df = _cluster_csw_table(cluster3, 3)
clus4df = _cluster_csw_table(cluster4, 4)
clus5df = _cluster_csw_table(cluster5, 5)
clus6df = _cluster_csw_table(cluster6, 6)
# All clusters stacked in one table with a unique 0..n-1 index
final_df = pd.concat(
    [clus1df, clus2df, clus3df, clus4df, clus5df, clus6df],
    ignore_index=True,
)
Average velocity: 95.75
Average spin rate: 2195
# Ten highest-CSW pitchers in cluster 1
cluster_rows = final_df.loc[final_df['Cluster'] == 1]
cluster_rows.sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Reid-Foley, Sean | 215 | 0.372093 | 1 |
0 | Vest, Will | 632 | 0.360759 | 1 |
0 | De León, José | 104 | 0.346154 | 1 |
0 | Jackson, Luke | 144 | 0.333333 | 1 |
0 | May, Trevor | 375 | 0.32 | 1 |
0 | Peralta, Wandy | 132 | 0.318182 | 1 |
0 | Puk, A.J. | 948 | 0.312236 | 1 |
0 | Speier, Gabe | 284 | 0.309859 | 1 |
0 | Kahnle, Tommy | 252 | 0.309524 | 1 |
0 | Winckowski, Josh | 190 | 0.289474 | 1 |
# Ten highest-CSW pitchers in cluster 1 with more than 199 pitches there
in_cluster = final_df['Cluster'] == 1
enough_pitches = final_df['PitchesThrown'] > 199
final_df.loc[in_cluster & enough_pitches].sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Reid-Foley, Sean | 215 | 0.372093 | 1 |
0 | Vest, Will | 632 | 0.360759 | 1 |
0 | May, Trevor | 375 | 0.32 | 1 |
0 | Puk, A.J. | 948 | 0.312236 | 1 |
0 | Speier, Gabe | 284 | 0.309859 | 1 |
0 | Kahnle, Tommy | 252 | 0.309524 | 1 |
0 | Hentges, Sam | 560 | 0.285714 | 1 |
0 | Leone, Dominic | 333 | 0.285285 | 1 |
0 | Flaherty, Jack | 270 | 0.277778 | 1 |
0 | Thompson, Zack | 270 | 0.277778 | 1 |
Reid-Foley, Sean
Average velocity: 93.12
Average spin rate: 2104
# Ten highest-CSW pitchers in cluster 2
cluster_rows = final_df.loc[final_df['Cluster'] == 2]
cluster_rows.sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Bickford, Phil | 116 | 0.362069 | 2 |
0 | Gibaut, Ian | 100 | 0.35 | 2 |
0 | Gilbert, Tyler | 104 | 0.346154 | 2 |
0 | Lauer, Eric | 160 | 0.325 | 2 |
0 | Miller, Shelby | 124 | 0.322581 | 2 |
0 | Jackson, Luke | 112 | 0.321429 | 2 |
0 | De Los Santos, Enyel | 110 | 0.318182 | 2 |
0 | Taylor, Josh | 128 | 0.3125 | 2 |
0 | Schreiber, John | 370 | 0.297297 | 2 |
0 | Gipson-Long, Sawyer | 108 | 0.296296 | 2 |
# Ten highest-CSW pitchers in cluster 2 with more than 199 pitches there
in_cluster = final_df['Cluster'] == 2
enough_pitches = final_df['PitchesThrown'] > 199
final_df.loc[in_cluster & enough_pitches].sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Schreiber, John | 370 | 0.297297 | 2 |
0 | Davidson, Tucker | 415 | 0.293976 | 2 |
0 | Fried, Max | 200 | 0.275 | 2 |
0 | Morgan, Eli | 240 | 0.270833 | 2 |
0 | Littell, Zack | 588 | 0.263605 | 2 |
0 | Cox, Austin | 324 | 0.259259 | 2 |
0 | López, Pablo | 340 | 0.258824 | 2 |
0 | Holton, Tyler | 284 | 0.253521 | 2 |
0 | Barnes, Matt | 208 | 0.25 | 2 |
0 | Ryan, Joe | 1252 | 0.239617 | 2 |
Schreiber, John
Average velocity: 98.48
Average spin rate: 2351
# Ten highest-CSW pitchers in cluster 3
cluster_rows = final_df.loc[final_df['Cluster'] == 3]
cluster_rows.sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | deGrom, Jacob | 448 | 0.366071 | 3 |
0 | Montero, Rafael | 195 | 0.358974 | 3 |
0 | De Los Santos, Enyel | 155 | 0.354839 | 3 |
0 | Ruiz, José | 148 | 0.351351 | 3 |
0 | Glasnow, Tyler | 448 | 0.348214 | 3 |
0 | Fairbanks, Pete | 816 | 0.318627 | 3 |
0 | Bautista, Félix | 1470 | 0.316327 | 3 |
0 | Guerra, Javy | 228 | 0.315789 | 3 |
0 | Kopech, Michael | 175 | 0.314286 | 3 |
0 | Scott, Tanner | 372 | 0.311828 | 3 |
# Ten highest-CSW pitchers in cluster 3 with more than 199 pitches there
in_cluster = final_df['Cluster'] == 3
enough_pitches = final_df['PitchesThrown'] > 199
final_df.loc[in_cluster & enough_pitches].sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | deGrom, Jacob | 448 | 0.366071 | 3 |
0 | Glasnow, Tyler | 448 | 0.348214 | 3 |
0 | Fairbanks, Pete | 816 | 0.318627 | 3 |
0 | Bautista, Félix | 1470 | 0.316327 | 3 |
0 | Guerra, Javy | 228 | 0.315789 | 3 |
0 | Scott, Tanner | 372 | 0.311828 | 3 |
0 | Williams, Gavin | 305 | 0.311475 | 3 |
0 | Brown, Hunter | 500 | 0.3 | 3 |
0 | Gausman, Kevin | 404 | 0.29703 | 3 |
0 | Finnegan, Kyle | 505 | 0.287129 | 3 |
deGrom, Jacob
Average velocity: 95.65
Average spin rate: 2469
# Ten highest-CSW pitchers in cluster 4
cluster_rows = final_df.loc[final_df['Cluster'] == 4]
cluster_rows.sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Gibaut, Ian | 125 | 0.4 | 4 |
0 | Akin, Keegan | 100 | 0.4 | 4 |
0 | Williams, Devin | 184 | 0.391304 | 4 |
0 | Phillips, Evan | 132 | 0.363636 | 4 |
0 | Turnbull, Spencer | 100 | 0.36 | 4 |
0 | Acton, Garrett | 155 | 0.354839 | 4 |
0 | Paxton, James | 170 | 0.352941 | 4 |
0 | Jax, Griffin | 212 | 0.339623 | 4 |
0 | Gausman, Kevin | 756 | 0.338624 | 4 |
0 | White, Brendan | 276 | 0.333333 | 4 |
# Ten highest-CSW pitchers in cluster 4 with more than 199 pitches there
in_cluster = final_df['Cluster'] == 4
enough_pitches = final_df['PitchesThrown'] > 199
final_df.loc[in_cluster & enough_pitches].sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Jax, Griffin | 212 | 0.339623 | 4 |
0 | Gausman, Kevin | 756 | 0.338624 | 4 |
0 | White, Brendan | 276 | 0.333333 | 4 |
0 | Diekman, Jake | 221 | 0.316742 | 4 |
0 | Perdomo, Angel | 380 | 0.315789 | 4 |
0 | Stratton, Chris | 382 | 0.311518 | 4 |
0 | Webb, Jacob | 858 | 0.311189 | 4 |
0 | Nardi, Andrew | 212 | 0.301887 | 4 |
0 | Romano, Jordan | 200 | 0.3 | 4 |
0 | Bellatti, Andrew | 200 | 0.3 | 4 |
Jax, Griffin
Average velocity: 93.17
Average spin rate: 2373
# Ten highest-CSW pitchers in cluster 5
cluster_rows = final_df.loc[final_df['Cluster'] == 5]
cluster_rows.sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Holton, Tyler | 136 | 0.441176 | 5 |
0 | Springs, Jeffrey | 104 | 0.384615 | 5 |
0 | Garcia, Robert | 135 | 0.37037 | 5 |
0 | Rea, Colin | 120 | 0.366667 | 5 |
0 | Lovelady, Richard | 180 | 0.361111 | 5 |
0 | Hudson, Bryan | 124 | 0.354839 | 5 |
0 | Miller, Shelby | 344 | 0.348837 | 5 |
0 | Ottavino, Adam | 145 | 0.344828 | 5 |
0 | Curtiss, John | 135 | 0.333333 | 5 |
0 | Moran, Jovani | 244 | 0.311475 | 5 |
# Ten highest-CSW pitchers in cluster 5 with more than 199 pitches there
in_cluster = final_df['Cluster'] == 5
enough_pitches = final_df['PitchesThrown'] > 199
final_df.loc[in_cluster & enough_pitches].sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Miller, Shelby | 344 | 0.348837 | 5 |
0 | Moran, Jovani | 244 | 0.311475 | 5 |
0 | Wentz, Joey | 432 | 0.305556 | 5 |
0 | Armstrong, Shawn | 212 | 0.301887 | 5 |
0 | Ryan, Joe | 1128 | 0.297872 | 5 |
0 | Strahm, Matt | 764 | 0.293194 | 5 |
0 | Maeda, Kenta | 252 | 0.285714 | 5 |
0 | Neris, Hector | 1215 | 0.279835 | 5 |
0 | Rogers, Trevor | 216 | 0.277778 | 5 |
0 | Cortes, Nestor | 888 | 0.277027 | 5 |
Miller, Shelby
Average velocity: 90.17
Average spin rate: 2195
# Ten highest-CSW pitchers in cluster 6
cluster_rows = final_df.loc[final_df['Cluster'] == 6]
cluster_rows.sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Carlton, Drew | 110 | 0.409091 | 6 |
0 | Banks, Tanner | 150 | 0.333333 | 6 |
0 | Abbott, Cory | 125 | 0.28 | 6 |
0 | Ryan, Joe | 724 | 0.276243 | 6 |
0 | Floro, Dylan | 132 | 0.272727 | 6 |
0 | Norris, Daniel | 185 | 0.27027 | 6 |
0 | Martinez, Seth | 505 | 0.267327 | 6 |
0 | Cox, Austin | 184 | 0.26087 | 6 |
0 | Rodriguez, Eduardo | 448 | 0.25 | 6 |
0 | Mikolas, Miles | 140 | 0.25 | 6 |
# Ten highest-CSW pitchers in cluster 6 with more than 199 pitches there
in_cluster = final_df['Cluster'] == 6
enough_pitches = final_df['PitchesThrown'] > 199
final_df.loc[in_cluster & enough_pitches].sort_values(by='CSW', ascending=False).head(10)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Ryan, Joe | 724 | 0.276243 | 6 |
0 | Martinez, Seth | 505 | 0.267327 | 6 |
0 | Rodriguez, Eduardo | 448 | 0.25 | 6 |
0 | Bradford, Cody | 932 | 0.240343 | 6 |
0 | Shreve, Chasen | 385 | 0.238961 | 6 |
0 | Alexander, Tyler | 404 | 0.227723 | 6 |
0 | Maton, Phil | 615 | 0.227642 | 6 |
0 | Maeda, Kenta | 572 | 0.223776 | 6 |
0 | Holton, Tyler | 328 | 0.219512 | 6 |
0 | Smeltzer, Devin | 292 | 0.219178 | 6 |
Ryan, Joe
# Overall: the 25 highest-CSW rows across every cluster, keeping only rows
# with more than 99 pitches thrown.
(
    final_df.loc[final_df['PitchesThrown'] > 99]
    .sort_values(by='CSW', ascending=False)
    .head(25)
)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Holton, Tyler | 136 | 0.441176 | 5 |
0 | Carlton, Drew | 110 | 0.409091 | 6 |
0 | Gibaut, Ian | 125 | 0.4 | 4 |
0 | Akin, Keegan | 100 | 0.4 | 4 |
0 | Williams, Devin | 184 | 0.391304 | 4 |
0 | Springs, Jeffrey | 104 | 0.384615 | 5 |
0 | Reid-Foley, Sean | 215 | 0.372093 | 1 |
0 | Garcia, Robert | 135 | 0.37037 | 5 |
0 | Rea, Colin | 120 | 0.366667 | 5 |
0 | deGrom, Jacob | 448 | 0.366071 | 3 |
0 | Phillips, Evan | 132 | 0.363636 | 4 |
0 | Bickford, Phil | 116 | 0.362069 | 2 |
0 | Lovelady, Richard | 180 | 0.361111 | 5 |
0 | Vest, Will | 632 | 0.360759 | 1 |
0 | Turnbull, Spencer | 100 | 0.36 | 4 |
0 | Montero, Rafael | 195 | 0.358974 | 3 |
0 | Acton, Garrett | 155 | 0.354839 | 4 |
0 | Hudson, Bryan | 124 | 0.354839 | 5 |
0 | De Los Santos, Enyel | 155 | 0.354839 | 3 |
0 | Paxton, James | 170 | 0.352941 | 4 |
0 | Ruiz, José | 148 | 0.351351 | 3 |
0 | Gibaut, Ian | 100 | 0.35 | 2 |
0 | Miller, Shelby | 344 | 0.348837 | 5 |
0 | Glasnow, Tyler | 448 | 0.348214 | 3 |
0 | Gilbert, Tyler | 104 | 0.346154 | 2 |
# Select every pitch labelled 'Slider'. Work on an explicit copy so that the
# column added below is written to this frame rather than to a slice of
# all_data (the original relied on the suppressed SettingWithCopyWarning).
all_sliders = all_data[all_data['pitch_name'] == 'Slider'].copy()
# Magnitude of horizontal movement: the sign of pfx_x is discarded.
all_sliders['pfx_x2'] = all_sliders['pfx_x'].abs()

# One entry per histogram panel: (column, x-axis label, title, x-axis limits).
slider_hist_panels = [
    ('release_speed', 'release_speed',
     'All sliders thrown, velocity histogram', (72, 95)),
    ('pfx_x2', 'horizontal movement (in feet)',
     'All sliders thrown, horizontal movement histogram', (0, 2)),
    ('pfx_z', 'vertical movement (in feet)',
     'All sliders thrown, vertical movement histogram', (-1.5, 1.5)),
    ('release_spin_rate', 'spin rate',
     'All sliders thrown, spin rate histogram', (1500, 3500)),
]
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(22, 3))
for panel_ax, (column, xlabel, title, xlim) in zip((ax1, ax2, ax3, ax4), slider_hist_panels):
    panel_ax.hist(all_sliders[column], bins=50)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(title)
    panel_ax.set_xlim(*xlim)

# Summary statistics over all sliders (spin rate is rounded to an integer).
print('Average velocity: {}'.format(round(np.mean(all_sliders['release_speed']), 2)))
print('Average spin rate: {}'.format(round(np.mean(all_sliders['release_spin_rate']))))
print('Average horizontal movement: {}'.format(round(np.mean(all_sliders['pfx_x2']), 2)))
print('Average vertical movement: {}'.format(round(np.mean(all_sliders['pfx_z']), 2)))

# Keep pitcher name and pitch outcome next to the four clustering features,
# drop pitches at or below 75 in release_speed, and drop rows with any
# missing value. The trailing copy makes the frame independent because a
# 'Cluster' column is assigned to it after the k-means fit.
slider_data_w_pitcher = all_sliders[['player_name','description','release_speed','release_spin_rate', 'pfx_x2', 'pfx_z']]
slider_data_w_pitcher = slider_data_w_pitcher[slider_data_w_pitcher['release_speed']>75].dropna().copy()

# Feature matrix: the four numeric columns, each min-max scaled.
slider_data = slider_data_w_pitcher[['release_speed','release_spin_rate', 'pfx_x2', 'pfx_z']]
x = slider_data.values  # plain numpy array for scikit-learn
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Columns stay positional (0..3); the index is copied from slider_data so
# normalized rows can be matched to their source pitches.
slider_df_normalized = pd.DataFrame(x_scaled)
slider_df_normalized.index = slider_data.index
clustering_data = slider_df_normalized.values
slider_data_w_pitcher.head(5)
player_name | description | release_speed | release_spin_rate | pfx_x2 | pfx_z | |
---|---|---|---|---|---|---|
2289634 | Bieber, Shane | ball | 84.5 | 2509.0 | 0.27 | 0.22 |
2289636 | Stephan, Trevor | swinging_strike | 84.8 | 2534.0 | 0.73 | 0.68 |
2289640 | Bieber, Shane | ball | 84.8 | 2470.0 | 0.19 | 0.23 |
2289641 | De Los Santos, Enyel | swinging_strike_blocked | 82.9 | 1845.0 | 0.11 | 0.09 |
2289642 | Stephan, Trevor | foul | 84.1 | 2631.0 | 0.83 | 0.20 |
# Preview the first five rows of the min-max scaled feature table.
slider_df_normalized.head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
2289634 | 0.443396 | 0.672414 | 0.114407 | 0.524648 |
2289636 | 0.457547 | 0.681585 | 0.309322 | 0.686620 |
2289640 | 0.457547 | 0.658107 | 0.080508 | 0.528169 |
2289641 | 0.367925 | 0.428833 | 0.046610 | 0.478873 |
2289642 | 0.424528 | 0.717168 | 0.351695 | 0.517606 |
# Elbow method: fit k-means for k = 1..10 and record each model's inertia
# (sum of squared distances of samples to their closest cluster centre).
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=10)
    kmeans.fit(clustering_data)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Plot to Find Optimal Number of Clusters')
# The original read `plt.show` without parentheses, which only references the
# function and never renders the figure.
plt.show()

# Final model: five clusters over the scaled slider features.
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(clustering_data)
# Plot the first two scaled features. The original passed feature columns 2
# and 3 as the third and fourth positional arguments, which plt.scatter
# interprets as marker size and colour; colour the points by assigned cluster
# instead and use a small fixed marker size.
plt.scatter(clustering_data[:, 0], clustering_data[:, 1], s=2, c=pred_y)
# Cluster centres projected onto the same two features.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.show()
# Attach the predicted label to every pitch used for clustering.
slider_data_w_pitcher['Cluster'] = pred_y
# Labels 0..4 map to cluster1..cluster5; each frame is ordered by descending
# horizontal movement magnitude (pfx_x2).
cluster1, cluster2, cluster3, cluster4, cluster5 = (
    slider_data_w_pitcher.loc[slider_data_w_pitcher['Cluster'] == label]
    .sort_values(by='pfx_x2', ascending=False)
    for label in range(5)
)
# Reference panels for all sliders: one histogram per clustering feature.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(22, 3))
reference_panels = (
    (ax1, 'release_speed', 'release_speed',
     'All sliders thrown, velocity histogram', (72, 95)),
    (ax2, 'pfx_x2', 'horizontal movement (in feet)',
     'All sliders thrown, horizontal movement histogram', (0, 2)),
    (ax3, 'pfx_z', 'vertical movement (in feet)',
     'All sliders thrown, vertical movement histogram', (-1.5, 1.5)),
    (ax4, 'release_spin_rate', 'spin rate',
     'All sliders thrown, spin rate histogram', (1500, 3500)),
)
for panel_ax, feature, x_label, panel_title, (x_low, x_high) in reference_panels:
    panel_ax.hist(all_sliders[feature], bins=50)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlim(x_low, x_high)

# Means over all sliders; spin rate is rounded to an integer, the rest to two decimals.
mean_velocity = round(np.mean(all_sliders['release_speed']), 2)
mean_spin = round(np.mean(all_sliders['release_spin_rate']))
mean_horizontal = round(np.mean(all_sliders['pfx_x2']), 2)
mean_vertical = round(np.mean(all_sliders['pfx_z']), 2)
print(f'Average velocity: {mean_velocity}')
print(f'Average spin rate: {mean_spin}')
print(f'Average horizontal movement: {mean_horizontal}')
print(f'Average vertical movement: {mean_vertical}')
def _plot_cluster_histograms(cluster_df, cluster_label):
    """Draw the four feature histograms for one slider cluster and print its means.

    Args:
        cluster_df: DataFrame holding the pitches of a single cluster; must
            contain the columns 'release_speed', 'pfx_x2', 'pfx_z' and
            'release_spin_rate'.
        cluster_label: Text used at the start of every panel title,
            e.g. 'Cluster 1'.

    Returns:
        The (figure, axes) pair created by plt.subplots, with axes holding
        the four panels in left-to-right order.
    """
    # (column, x-axis label, title fragment, x-axis limits) for each panel.
    # Axis limits are the same for every cluster so panels can be compared.
    panels = [
        ('release_speed', 'release_speed', 'velocity', (72, 95)),
        ('pfx_x2', 'horizontal movement (in feet)', 'horizontal movement', (0, 2)),
        ('pfx_z', 'vertical movement (in feet)', 'vertical movement', (-1.5, 1.5)),
        ('release_spin_rate', 'spin rate', 'spin rate', (1500, 3500)),
    ]
    figure, axes = plt.subplots(1, 4, figsize=(22, 3))
    for panel_ax, (column, xlabel, what, xlim) in zip(axes, panels):
        panel_ax.hist(cluster_df[column], bins=50)
        panel_ax.set_xlabel(xlabel)
        panel_ax.set_ylabel('Frequency')
        # The original titled every panel 'All sliders thrown, ...' even though
        # only one cluster is plotted; name the cluster instead.
        panel_ax.set_title(f'{cluster_label} sliders, {what} histogram')
        panel_ax.set_xlim(*xlim)

    # Cluster means; spin rate is rounded to an integer, the rest to two decimals.
    print('Average velocity: {}'.format(round(np.mean(cluster_df['release_speed']), 2)))
    print('Average spin rate: {}'.format(round(np.mean(cluster_df['release_spin_rate']))))
    print('Average horizontal movement: {}'.format(round(np.mean(cluster_df['pfx_x2']), 2)))
    print('Average vertical movement: {}'.format(round(np.mean(cluster_df['pfx_z']), 2)))
    return figure, axes


# One figure per cluster, in cluster order. fig/ax1..ax4 are rebound on every
# pass, so after the loop they refer to the cluster 5 figure as before.
for cluster_number, cluster_frame in enumerate(
        (cluster1, cluster2, cluster3, cluster4, cluster5), start=1):
    fig, (ax1, ax2, ax3, ax4) = _plot_cluster_histograms(
        cluster_frame, f'Cluster {cluster_number}')
전체 슬라이더 평균:
Average velocity: 85.25
Average spin rate: 2420
Average horizontal movement: 0.52
Average vertical movement: 0.11
Cluster 1: 높은 구속, 높은 회전수, 낮은 횡 무브먼트, 낮은 종 무브먼트
Average velocity: 87.93
Average spin rate: 2567
Average horizontal movement: 0.38
Average vertical movement: 0.01
Cluster 2: 높은 구속, 매우 낮은 회전수, 낮은 횡 무브먼트, 높은 종 무브먼트
Average velocity: 87.72
Average spin rate: 2247
Average horizontal movement: 0.25
Average vertical movement: 0.44
Cluster 3: 매우 낮은 구속, 평균 회전수, 높은 횡 무브먼트, 낮은 종 무브먼트
Average velocity: 80.74
Average spin rate: 2480
Average horizontal movement: 1.21
Average vertical movement: 0.05
Cluster 4: 낮은 구속, 낮은 회전수, 낮은 횡 무브먼트, 낮은 종 무브먼트
Average velocity: 83.55
Average spin rate: 2316
Average horizontal movement: 0.31
Average vertical movement: -0.02
Cluster 5: 평균 구속, 높은 회전수, 매우 높은 횡 무브먼트, 낮은 종 무브먼트
Average velocity: 85.13
Average spin rate: 2629
Average horizontal movement: 0.89
Average vertical movement: -0.0
CSW Rates
Cluster 1: 24.57%
Cluster 2: 22.98%
Cluster 3: 23.29%
Cluster 4: 23.34%
Cluster 5: 23.69%
# CSW rate of each slider cluster as a percentage, rounded to two decimals.
for cluster_frame in (cluster1, cluster2, cluster3, cluster4, cluster5):
    csw_percent = cswRate(cluster_frame) * 100
    print(round(csw_percent, 2))
# For each cluster, list the three pitchers who account for the largest share
# of its pitches (share = pitcher's pitches in the cluster / cluster size).
for cluster_number, cluster_frame in enumerate(
        (cluster1, cluster2, cluster3, cluster4, cluster5), start=1):
    print(f'Cluster {cluster_number} Top 3 Pitchers:')
    pitch_share = cluster_frame['player_name'].value_counts(normalize=True)
    share_table = pitch_share.rename_axis('Pitcher').reset_index(name='Frequency')
    print(share_table.sort_values(by='Frequency', ascending=False).head(3))
    print('\n')
# Scatter of spin rate against the magnitude of horizontal movement for all sliders.
label1, label2 = 'release_spin_rate', 'pfx_x'
spin_values = all_sliders[label1]
movement_magnitude = all_sliders[label2].abs()
plt.scatter(spin_values, movement_magnitude)
plt.title(f'{label1} vs. {label2}')
plt.xlabel(label1)
plt.ylabel(label2)
def _pitcher_csw_table(cluster_df, cluster_id):
    """Summarise one slider cluster per pitcher.

    Args:
        cluster_df: DataFrame with the pitches of a single cluster; must
            contain 'player_name' plus whatever columns cswRate reads.
        cluster_id: Number stored in the 'Cluster' column of the result.

    Returns:
        DataFrame with columns 'Pitcher', 'PitchesThrown', 'CSW' and
        'Cluster', one row per pitcher, in order of each pitcher's first
        appearance in cluster_df.
    """
    # A single groupby pass replaces the original pattern of filtering the
    # cluster once per pitcher and growing the result with pd.concat inside
    # the loop (quadratic copying, started from an empty frame).
    rows = [
        {'Pitcher': pitcher, 'PitchesThrown': len(pitcher_df), 'CSW': cswRate(pitcher_df)}
        for pitcher, pitcher_df in cluster_df.groupby('player_name', sort=False)
    ]
    # Passing columns explicitly keeps the schema even when the cluster is empty.
    table = pd.DataFrame(rows, columns=['Pitcher', 'PitchesThrown', 'CSW'])
    table['Cluster'] = cluster_id
    return table


# Distinct pitchers per cluster (names kept from the original cells).
clus1pitchers = cluster1['player_name'].unique()
clus2pitchers = cluster2['player_name'].unique()
clus3pitchers = cluster3['player_name'].unique()
clus4pitchers = cluster4['player_name'].unique()
clus5pitchers = cluster5['player_name'].unique()

# Per-cluster summaries, numbered 1..5 to match cluster1..cluster5.
clus1df = _pitcher_csw_table(cluster1, 1)
clus2df = _pitcher_csw_table(cluster2, 2)
clus3df = _pitcher_csw_table(cluster3, 3)
clus4df = _pitcher_csw_table(cluster4, 4)
clus5df = _pitcher_csw_table(cluster5, 5)

# Stack the five tables. ignore_index gives every row a unique label; the
# original left all rows with index label 0.
final_df = pd.concat([clus1df, clus2df, clus3df, clus4df, clus5df], ignore_index=True)
Average velocity: 87.93
Average spin rate: 2567
Average horizontal movement: 0.38
Average vertical movement: 0.01
# Ten highest-CSW rows among those assigned to slider cluster 1.
(
    final_df.loc[final_df['Cluster'] == 1]
    .sort_values(by='CSW', ascending=False)
    .head(10)
)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Gallegos, Giovanny | 100 | 0.400000 | 1 |
0 | Strider, Spencer | 795 | 0.383648 | 1 |
0 | Stephenson, Robert | 304 | 0.365132 | 1 |
0 | Helsley, Ryan | 500 | 0.360000 | 1 |
0 | Romano, Jordan | 640 | 0.350000 | 1 |
0 | Selby, Colin | 290 | 0.344828 | 1 |
0 | Bellatti, Andrew | 140 | 0.342857 | 1 |
0 | Brash, Matt | 400 | 0.340000 | 1 |
0 | Alzolay, Adbert | 624 | 0.339744 | 1 |
0 | Uribe, Abner | 224 | 0.339286 | 1 |
Gallegos, Giovanny
Average velocity: 87.72
Average spin rate: 2247
Average horizontal movement: 0.25
Average vertical movement: 0.44
# Ten highest-CSW rows among those assigned to slider cluster 2.
(
    final_df.loc[final_df['Cluster'] == 2]
    .sort_values(by='CSW', ascending=False)
    .head(10)
)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Leclerc, José | 148 | 0.432432 | 2 |
0 | Helsley, Ryan | 210 | 0.428571 | 2 |
0 | Hader, Josh | 255 | 0.411765 | 2 |
0 | Hentges, Sam | 240 | 0.395833 | 2 |
0 | Santos, Gregory | 155 | 0.387097 | 2 |
0 | Grove, Michael | 156 | 0.384615 | 2 |
0 | Gallegos, Giovanny | 235 | 0.382979 | 2 |
0 | Kopech, Michael | 265 | 0.377358 | 2 |
0 | Jackson, Jay | 128 | 0.375000 | 2 |
0 | Gaddis, Hunter | 180 | 0.361111 | 2 |
Leclerc, José
Average velocity: 80.74
Average spin rate: 2480
Average horizontal movement: 1.21
Average vertical movement: 0.05
# Ten highest-CSW rows among those assigned to slider cluster 3.
(
    final_df.loc[final_df['Cluster'] == 3]
    .sort_values(by='CSW', ascending=False)
    .head(10)
)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Farmer, Buck | 110 | 0.545455 | 3 |
0 | Funderburk, Kody | 132 | 0.363636 | 3 |
0 | Castro, Miguel | 188 | 0.361702 | 3 |
0 | Keuchel, Dallas | 168 | 0.357143 | 3 |
0 | Weiss, Zack | 177 | 0.355932 | 3 |
0 | Soriano, George | 160 | 0.350000 | 3 |
0 | Bibee, Tanner | 105 | 0.333333 | 3 |
0 | Webb, Logan | 400 | 0.330000 | 3 |
0 | Strahm, Matt | 176 | 0.318182 | 3 |
0 | Peralta, Freddy | 424 | 0.301887 | 3 |
Farmer, Buck
Average velocity: 83.55
Average spin rate: 2316
Average horizontal movement: 0.31
Average vertical movement: -0.02
# Ten highest-CSW rows among those assigned to slider cluster 4.
(
    final_df.loc[final_df['Cluster'] == 4]
    .sort_values(by='CSW', ascending=False)
    .head(10)
)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Overton, Connor | 100 | 0.400000 | 4 |
0 | Strider, Spencer | 1420 | 0.383803 | 4 |
0 | Jiménez, Joe | 595 | 0.378151 | 4 |
0 | Zimmermann, Bruce | 180 | 0.361111 | 4 |
0 | Greene, Hunter | 240 | 0.354167 | 4 |
0 | Luzardo, Jesús | 1476 | 0.349593 | 4 |
0 | Saucedo, Tayler | 312 | 0.333333 | 4 |
0 | Santana, Dennis | 150 | 0.333333 | 4 |
0 | Nelson, Kyle | 280 | 0.328571 | 4 |
0 | Lambert, Jimmy | 260 | 0.326923 | 4 |
Overton, Connor
Average velocity: 85.13
Average spin rate: 2629
Average horizontal movement: 0.89
Average vertical movement: -0.0
# Ten highest-CSW rows among those assigned to slider cluster 5.
(
    final_df.loc[final_df['Cluster'] == 5]
    .sort_values(by='CSW', ascending=False)
    .head(10)
)
Pitcher | PitchesThrown | CSW | Cluster | |
---|---|---|---|---|
0 | Strider, Spencer | 305 | 0.426230 | 5 |
0 | Romero, JoJo | 345 | 0.405797 | 5 |
0 | Chapman, Aroldis | 248 | 0.403226 | 5 |
0 | Chargois, JT | 116 | 0.379310 | 5 |
0 | Jameson, Drey | 240 | 0.366667 | 5 |
0 | Moreta, Dauri | 520 | 0.355769 | 5 |
0 | Brash, Matt | 1160 | 0.348276 | 5 |
0 | Stephenson, Robert | 140 | 0.342857 | 5 |
0 | Miller, Mason | 225 | 0.333333 | 5 |
0 | Herrin, Tim | 105 | 0.333333 | 5 |
Strider, Spencer