실습. 코비선수 데이터 분석해보기 및 테스트

Updated:

  • https://github.com/baidoosik/kaggle-solving/tree/master/Kobe

데이터 분석

데이터 살펴보기

# import libraries
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for the plots that follow.
sns.set_style('whitegrid')

# display all columns
pd.set_option('display.max_columns', None)
# Load the shot log; data.csv is read from the current working directory.
data = pd.read_csv('data.csv')
data.head(3)
# Because display.max_columns is set to None, every column is shown without truncation.
action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining period playoffs season seconds_remaining shot_distance shot_made_flag shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent shot_id
0 Jump Shot Jump Shot 10 20000012 33.9723 167 72 -118.1028 10 1 0 2000-01 27 18 NaN 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 1
1 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 1 0 2000-01 22 15 0.0 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 2
2 Jump Shot Jump Shot 35 20000012 33.9093 -101 135 -118.3708 7 1 0 2000-01 45 16 1.0 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 3
# Summarize column dtypes and non-null counts to spot missing values.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   action_type         30697 non-null  object 
 1   combined_shot_type  30697 non-null  object 
 2   game_event_id       30697 non-null  int64  
 3   game_id             30697 non-null  int64  
 4   lat                 30697 non-null  float64
 5   loc_x               30697 non-null  int64  
 6   loc_y               30697 non-null  int64  
 7   lon                 30697 non-null  float64
 8   minutes_remaining   30697 non-null  int64  
 9   period              30697 non-null  int64  
 10  playoffs            30697 non-null  int64  
 11  season              30697 non-null  object 
 12  seconds_remaining   30697 non-null  int64  
 13  shot_distance       30697 non-null  int64  
 14  shot_made_flag      25697 non-null  float64
 15  shot_type           30697 non-null  object 
 16  shot_zone_area      30697 non-null  object 
 17  shot_zone_basic     30697 non-null  object 
 18  shot_zone_range     30697 non-null  object 
 19  team_id             30697 non-null  int64  
 20  team_name           30697 non-null  object 
 21  game_date           30697 non-null  object 
 22  matchup             30697 non-null  object 
 23  opponent            30697 non-null  object 
 24  shot_id             30697 non-null  int64  
dtypes: float64(3), int64(11), object(11)
memory usage: 5.9+ MB
# Re-type columns so they are convenient to analyse: the columns listed in
# `category_columns` become pandas categoricals, while action_type and period
# are stored as plain objects.
object_columns = ['action_type', 'period']
category_columns = [
    'combined_shot_type',
    'game_event_id',
    'game_id',
    'playoffs',
    'season',
    'shot_made_flag',
    'shot_type',
    'team_id',
]
for column_name in object_columns:
    data[column_name] = data[column_name].astype('object')
for column_name in category_columns:
    data[column_name] = data[column_name].astype('category')

# Use shot_id as the row index.
data.set_index('shot_id', inplace=True)
data.head(2)
action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining period playoffs season seconds_remaining shot_distance shot_made_flag shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent
shot_id
1 Jump Shot Jump Shot 10 20000012 33.9723 167 72 -118.1028 10 1 0 2000-01 27 18 NaN 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR
2 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 1 0 2000-01 22 15 0.0 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR
# Summary statistics restricted to the numeric columns.
data.describe(include=['number'])
lat loc_x loc_y lon minutes_remaining seconds_remaining shot_distance
count 30697.000000 30697.000000 30697.000000 30697.000000 30697.000000 30697.000000 30697.000000
mean 33.953192 7.110499 91.107535 -118.262690 4.885624 28.365085 13.437437
std 0.087791 110.124578 87.791361 0.110125 3.449897 17.478949 9.374189
min 33.253300 -250.000000 -44.000000 -118.519800 0.000000 0.000000 0.000000
25% 33.884300 -68.000000 4.000000 -118.337800 2.000000 13.000000 5.000000
50% 33.970300 0.000000 74.000000 -118.269800 5.000000 28.000000 15.000000
75% 34.040300 95.000000 160.000000 -118.174800 8.000000 43.000000 21.000000
max 34.088300 248.000000 791.000000 -118.021800 11.000000 59.000000 79.000000
# count / unique / top / freq for the category- and object-typed columns.
data.describe(include=['category', 'object'])
action_type combined_shot_type game_event_id game_id period playoffs season shot_made_flag shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent
count 30697 30697 30697 30697 30697 30697 30697 25697.0 30697 30697 30697 30697 30697 30697 30697 30697 30697
unique 57 6 620 1559 7 2 20 2.0 2 6 7 5 1 1 1559 74 33
top Jump Shot Jump Shot 2 21501228 3 0 2005-06 0.0 2PT Field Goal Center(C) Mid-Range Less Than 8 ft. 1610612747 Los Angeles Lakers 2016-04-13 LAL @ SAS SAS
freq 18880 23485 132 50 8296 26198 2318 14232.0 24271 13455 12625 9398 30697 30697 50 1020 1978

데이터 분석 및 시각화

# how='any' drops a row as soon as any single column is missing.
# An explicit copy is taken because a column of `train` is reassigned later;
# this keeps that assignment on an independent frame instead of one pandas
# may track as derived from `data`.
train = data.dropna(how='any').copy()

def bar_chart(feature, ax=None):
    """Draw a stacked bar chart of `feature` value counts for made vs. missed shots.

    Reads the module-level `train` frame. Rows where shot_made_flag equals 1
    form the 'Success' bar and rows where it equals 0 form the 'Fail' bar;
    each bar is stacked by the distinct values of `feature`.

    Args:
        feature: Name of the `train` column whose values are counted.
        ax: Optional matplotlib axes to draw on; forwarded to DataFrame.plot.
    """
    made_mask = train['shot_made_flag'] == 1
    missed_mask = train['shot_made_flag'] == 0
    counts = pd.DataFrame([
        train[made_mask][feature].value_counts(),
        train[missed_mask][feature].value_counts(),
    ])
    counts.index = ['Success', 'Fail']
    counts.plot(kind='bar', stacked=True, ax=ax)
# Create an axes, title it, and plot the overall made/missed split on it.
ax = plt.axes()
ax.set_title('shot made')
bar_chart('shot_made_flag',ax)
plt.show()

농구_14_0

  • 데이터에 큰 차이가 없다면 명시적으로 숫자로 확인하자
# Share of each outcome: per-value counts divided by the number of rows in train.
print(train['shot_made_flag'].value_counts() / len(train.index))
0.0    0.553839
1.0    0.446161
Name: shot_made_flag, dtype: float64
  • (위도, 경도), (x, y)처럼 짝지었을 때 의미 있는 데이터는 Seaborn 라이브러리의 pairplot을 이용해 시각화한다
# Pairwise plots of the location features, coloured by shot outcome.
# `height` (size of each facet in inches) replaces the `size` keyword, which
# seaborn deprecated in 0.9 and later removed.
sns.pairplot(train, vars=['loc_x','loc_y','lat','lon','shot_distance'], hue='shot_made_flag', height=3)
plt.show()

농구_18_0

  • Category 데이터의 항목이 많은 경우 stack방식이 아닌 Seaborn 라이브러리의 countplot() 함수를 이용
def count_plot(column, ax):
    """Plot per-value row counts of `column`, split by shot outcome.

    Args:
        column: Name of the column in the module-level `train` frame to count.
        ax: Matplotlib axes that receives the bars.
    """
    sns.countplot(data=train, x=column, hue='shot_made_flag', ax=ax)
# Categorical features to compare against the shot outcome, one subplot each.
categorical_data=['combined_shot_type','season','period','playoffs','shot_type','shot_zone_area','shot_zone_basic','shot_zone_range']

# Size the grid from the list so adding or removing a feature needs no other edit.
f, axrr = plt.subplots(len(categorical_data), figsize=(15,30))

for idx, category_data in enumerate(categorical_data,0):
    count_plot(category_data, axrr[idx])
    axrr[idx].set_title(category_data)

plt.tight_layout()
# show must be *called*: the bare name `plt.show` only evaluates to the
# function object and never renders the figure.
plt.show()

농구_21_1

  • 값이 비슷해 보일 때는 명시적으로 숫자로 보자
def print_probability(column):
    """Print the success rate of every value of `column`.

    For each distinct value the rate is made / (made + missed), computed from
    value counts over the module-level `train` frame. The result is printed;
    nothing is returned.

    Args:
        column: Name of the `train` column to break the success rate down by.
    """
    made_counts = train[train['shot_made_flag'] == 1][column].value_counts()
    missed_counts = train[train['shot_made_flag'] == 0][column].value_counts()
    print(made_counts / (made_counts + missed_counts))
# NOTE: print_probability() prints its table itself and returns None, so the
# outer print() is what emits the extra `None` line after each table.
for categoty_data in categorical_data:
    print(print_probability(categoty_data))
Bank Shot    0.791667
Dunk         0.928030
Hook Shot    0.535433
Jump Shot    0.391071
Layup        0.565093
Tip Shot     0.348684
Name: combined_shot_type, dtype: float64
None
1996-97    0.422977
1997-98    0.430864
1998-99    0.458824
1999-00    0.460366
2000-01    0.466667
2001-02    0.458431
2002-03    0.436285
2003-04    0.433260
2004-05    0.436557
2005-06    0.453742
2006-07    0.457885
2007-08    0.468389
2008-09    0.467855
2009-10    0.453725
2010-11    0.446417
2011-12    0.425847
2012-13    0.457831
2013-14    0.406780
2014-15    0.376054
2015-16    0.356223
Name: season, dtype: float64
None
1    0.465672
2    0.448802
3    0.453442
4    0.413702
5    0.442857
6    0.466667
7    0.428571
Name: period, dtype: float64
None
0    0.446420
1    0.444651
Name: playoffs, dtype: float64
None
2PT Field Goal    0.477348
3PT Field Goal    0.329268
Name: shot_type, dtype: float64
None
Back Court(BC)           0.013889
Center(C)                0.525556
Left Side Center(LC)     0.361177
Left Side(L)             0.396871
Right Side Center(RC)    0.382567
Right Side(R)            0.401658
Name: shot_zone_area, dtype: float64
None
Above the Break 3        0.329237
Backcourt                0.016667
In The Paint (Non-RA)    0.454381
Left Corner 3            0.370833
Mid-Range                0.406286
Restricted Area          0.618004
Right Corner 3           0.339339
Name: shot_zone_basic, dtype: float64
None
16-24 ft.          0.401766
24+ ft.            0.332513
8-16 ft.           0.435484
Back Court Shot    0.013889
Less Than 8 ft.    0.573120
Name: shot_zone_range, dtype: float64
None
  • 연속형(continuous) 데이터는 Seaborn의 FacetGrid를 이용해 시각화한다
def draw_facetgrid(feature):
    """Show overlaid shaded density curves of `feature`, one per shot outcome.

    Args:
        feature: Name of the column in the module-level `train` frame whose
            distribution is drawn.
    """
    grid = sns.FacetGrid(train, hue='shot_made_flag', aspect=5)
    # NOTE(review): newer seaborn releases prefer `fill` over `shade` --
    # confirm against the installed version before changing it.
    grid.map(sns.kdeplot, feature, shade=True)
    # Limit the x-axis to the range from 0 up to the largest observed value.
    upper_bound = train[feature].max()
    grid.set(xlim=(0, upper_bound))
    # Add a legend identifying the shot_made_flag hue levels.
    grid.add_legend()
    plt.show()
# Density of minutes_remaining, split by shot outcome.
draw_facetgrid('minutes_remaining')

농구_27_0

# Density of seconds_remaining, split by shot outcome.
draw_facetgrid('seconds_remaining')

농구_28_0

  • groupby 함수를 이용해 두 개의 column을 묶어서 분석하기
# Cast the label to int64, then compute the success rate (sum of made shots
# divided by the number of attempts) for each season / shot-type pair.
train['shot_made_flag'] = train['shot_made_flag'].astype('int64')
outcome_by_group = train.groupby(['season','combined_shot_type'])['shot_made_flag']
outcome_by_group.sum() / outcome_by_group.count()
season   combined_shot_type
1996-97  Bank Shot                  NaN
         Dunk                  0.947368
         Hook Shot                  NaN
         Jump Shot             0.380567
         Layup                 0.450450
                                 ...   
2015-16  Dunk                  1.000000
         Hook Shot             0.272727
         Jump Shot             0.327711
         Layup                 0.623529
         Tip Shot                   NaN
Name: shot_made_flag, Length: 120, dtype: float64

Feature Engineering

Data cleaning

  • Feature engineering 작업을 시작하기 전에 필요 없는 데이터들을 삭제해 정리
# Work on a copy so the raw frame stays untouched; keep the label separately.
data_cp = data.copy()
target = data_cp['shot_made_flag'].copy()

unused_columns = [
    # Kobe played for a single team, so team_id and team_name carry no information.
    'team_id',
    'team_name',
    # lat / lon can be replaced by loc_x / loc_y.
    'lat',
    'lon',
    # game_id, game_event_id are independent
    'game_id',
    'game_event_id',
    # The label is already saved in `target`; remove it from the feature frame.
    'shot_made_flag',
]
data_cp.drop(columns=unused_columns, inplace=True)
data_cp.head(2)
action_type combined_shot_type loc_x loc_y minutes_remaining period playoffs season seconds_remaining shot_distance shot_type shot_zone_area shot_zone_basic shot_zone_range game_date matchup opponent
shot_id
1 Jump Shot Jump Shot 167 72 10 1 0 2000-01 27 18 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. 2000-10-31 LAL @ POR POR
2 Jump Shot Jump Shot -157 0 10 1 0 2000-01 22 15 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 2000-10-31 LAL @ POR POR

Data Transformation

  • 의미 있는 데이터 즉 feature로 변형
  • 시각화를 통해 남은 시간이 5초 미만일 때의 특성을 찾음
# The time plots suggest that shots taken with little time left are missed more often.
seconds_left = data_cp['minutes_remaining'] * 60 + data_cp['seconds_remaining']
data_cp['last_5_sec_in_period'] = seconds_left < 5

# The raw clock columns are no longer needed once the flag is derived.
data_cp.drop(columns=['minutes_remaining', 'seconds_remaining'], inplace=True)

## home, away mapping
# Matchup strings containing 'vs' are encoded as 1, all others (e.g. 'LAL @ POR') as 0.
data_cp['home_away'] = data_cp['matchup'].str.contains('vs').astype('int')
data_cp.drop(columns='matchup', inplace=True)
data_cp.head(1)
action_type combined_shot_type loc_x loc_y period playoffs season shot_distance shot_type shot_zone_area shot_zone_basic shot_zone_range game_date opponent last_5_sec_in_period home_away
shot_id
1 Jump Shot Jump Shot 167 72 1 0 2000-01 18 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. 2000-10-31 POR False 0
# Split the game date into year and month features.
game_dates = pd.to_datetime(data_cp['game_date'])
data_cp['game_year'] = game_dates.dt.year
data_cp['game_month'] = game_dates.dt.month

# The raw date column is dropped once its parts have been extracted.
data_cp.drop(columns='game_date', inplace=True)
data_cp.head(2)
action_type combined_shot_type loc_x loc_y period playoffs season shot_distance shot_type shot_zone_area shot_zone_basic shot_zone_range opponent last_5_sec_in_period home_away game_year game_month
shot_id
1 Jump Shot Jump Shot 167 72 1 0 2000-01 18 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. POR False 0 2000 10
2 Jump Shot Jump Shot -157 0 1 0 2000-01 15 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. POR False 0 2000 10
# Bin loc_x and loc_y into 25 equal-width intervals each.
for coordinate in ('loc_x', 'loc_y'):
    data_cp[coordinate] = pd.cut(data_cp[coordinate], 25)
# Values that occur only once or twice are not meaningful on their own,
# so inspect how often each action_type appears.
data_cp.action_type.value_counts()
Jump Shot                             18880
Layup Shot                             2567
Driving Layup Shot                     1978
Turnaround Jump Shot                   1057
Fadeaway Jump Shot                     1048
Running Jump Shot                       926
Pullup Jump shot                        476
Turnaround Fadeaway shot                439
Slam Dunk Shot                          411
Reverse Layup Shot                      395
Jump Bank Shot                          333
Driving Dunk Shot                       310
Dunk Shot                               262
Tip Shot                                182
Alley Oop Dunk Shot                     122
Step Back Jump shot                     118
Floating Jump shot                      114
Driving Reverse Layup Shot               97
Hook Shot                                84
Driving Finger Roll Shot                 82
Alley Oop Layup shot                     80
Reverse Dunk Shot                        75
Running Layup Shot                       72
Turnaround Bank shot                     71
Driving Finger Roll Layup Shot           69
Driving Slam Dunk Shot                   48
Running Bank shot                        48
Running Hook Shot                        41
Finger Roll Layup Shot                   33
Fadeaway Bank shot                       31
Driving Jump shot                        28
Finger Roll Shot                         28
Jump Hook Shot                           24
Running Dunk Shot                        19
Reverse Slam Dunk Shot                   16
Putback Layup Shot                       15
Follow Up Dunk Shot                      15
Driving Hook Shot                        14
Turnaround Hook Shot                     14
Pullup Bank shot                         12
Running Reverse Layup Shot               11
Running Finger Roll Layup Shot            6
Cutting Layup Shot                        6
Hook Bank Shot                            5
Driving Bank shot                         5
Driving Floating Jump Shot                5
Putback Dunk Shot                         5
Running Finger Roll Shot                  4
Running Pull-Up Jump Shot                 4
Turnaround Finger Roll Shot               2
Tip Layup Shot                            2
Putback Slam Dunk Shot                    2
Running Tip Shot                          2
Running Slam Dunk Shot                    1
Cutting Finger Roll Layup Shot            1
Driving Floating Bank Jump Shot           1
Turnaround Fadeaway Bank Jump Shot        1
Name: action_type, dtype: int64
# Sort the action_type counts ascending and keep the labels of the first 20,
# i.e. the 20 least frequent action types.
rare_action_types = data_cp['action_type'].value_counts().sort_values().index.values[:20]
rare_action_types
array(['Turnaround Fadeaway Bank Jump Shot', 'Running Slam Dunk Shot',
       'Driving Floating Bank Jump Shot',
       'Cutting Finger Roll Layup Shot', 'Running Tip Shot',
       'Putback Slam Dunk Shot', 'Tip Layup Shot',
       'Turnaround Finger Roll Shot', 'Running Pull-Up Jump Shot',
       'Running Finger Roll Shot', 'Putback Dunk Shot',
       'Driving Floating Jump Shot', 'Driving Bank shot',
       'Hook Bank Shot', 'Cutting Layup Shot',
       'Running Finger Roll Layup Shot', 'Running Reverse Layup Shot',
       'Pullup Bank shot', 'Turnaround Hook Shot', 'Driving Hook Shot'],
      dtype=object)
# Relabel every row whose action_type is in the rare list as 'Other'.
data_cp.loc[data_cp['action_type'].isin(rare_action_types), 'action_type'] = 'Other'
data_cp.head(2)
action_type combined_shot_type loc_x loc_y period playoffs season shot_distance shot_type shot_zone_area shot_zone_basic shot_zone_range opponent last_5_sec_in_period home_away game_year game_month
shot_id
1 Jump Shot Jump Shot (148.4, 168.32] (56.2, 89.6] 1 0 2000-01 18 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. POR False 0 2000 10
2 Jump Shot Jump Shot (-170.32, -150.4] (-10.6, 22.8] 1 0 2000-01 15 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. POR False 0 2000 10
# Columns to one-hot encode. A list (not a set) is used so the iteration order,
# and therefore the order of the generated dummy columns, is the same on every
# run; string hash randomization makes set iteration order vary between
# interpreter sessions. The order below follows the dummy-column order of the
# recorded output table.
categorial_cols = [
    'shot_zone_basic', 'season', 'shot_zone_area', 'game_year', 'shot_type',
    'game_month', 'loc_x', 'combined_shot_type', 'loc_y', 'period',
    'opponent', 'action_type', 'shot_zone_range',
]
pd.get_dummies?
Convert categorical variable into dummy/indicator variables.

Parameters
----------
data : array-like, Series, or DataFrame
    Data of which to get dummy indicators.
prefix : str, list of str, or dict of str, default None
    String to append DataFrame column names.
    Pass a list with length equal to the number of columns
    when calling get_dummies on a DataFrame. Alternatively, `prefix`
    can be a dictionary mapping column names to prefixes.
prefix_sep : str, default '_'
    If appending prefix, separator/delimiter to use. Or pass a
    list or dictionary as with `prefix`.
dummy_na : bool, default False
    Add a column to indicate NaNs, if False NaNs are ignored.
columns : list-like, default None
    Column names in the DataFrame to be encoded.
    If `columns` is None then all the columns with
    `object` or `category` dtype will be converted.
sparse : bool, default False
    Whether the dummy-encoded columns should be backed by
    a :class:`SparseArray` (True) or a regular NumPy array (False).
drop_first : bool, default False
    Whether to get k-1 dummies out of k categorical levels by removing the
    first level.
dtype : dtype, default np.uint8
    Data type for new columns. Only a single dtype is allowed.

    .. versionadded:: 0.23.0

Returns
-------
DataFrame
    Dummy-coded data.

See Also
--------
Series.str.get_dummies : Convert Series to dummy codes.

Examples
--------
>>> s = pd.Series(list('abca'))

>>> pd.get_dummies(s)
   a  b  c
0  1  0  0
1  0  1  0
2  0  0  1
3  1  0  0

>>> s1 = ['a', 'b', np.nan]

>>> pd.get_dummies(s1)
   a  b
0  1  0
1  0  1
2  0  0

>>> pd.get_dummies(s1, dummy_na=True)
   a  b  NaN
0  1  0    0
1  0  1    0
2  0  0    1

>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
...                    'C': [1, 2, 3]})

>>> pd.get_dummies(df, prefix=['col1', 'col2'])
   C  col1_a  col1_b  col2_a  col2_b  col2_c
0  1       1       0       0       1       0
1  2       0       1       1       0       0
2  3       1       0       0       0       1

>>> pd.get_dummies(pd.Series(list('abcaa')))
   a  b  c
0  1  0  0
1  0  1  0
2  0  0  1
3  1  0  0
4  1  0  0

>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
   b  c
0  0  0
1  1  0
2  0  1
3  0  0
4  0  0

>>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
     a    b    c
0  1.0  0.0  0.0
1  0.0  1.0  0.0
2  0.0  0.0  1.0
File:      c:\users\user\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py
Type:      function
data_cp.join?
Join columns of another DataFrame.

Join columns with `other` DataFrame either on index or on a key
column. Efficiently join multiple DataFrame objects by index at once by
passing a list.

Parameters
----------
other : DataFrame, Series, or list of DataFrame
    Index should be similar to one of the columns in this one. If a
    Series is passed, its name attribute must be set, and that will be
    used as the column name in the resulting joined DataFrame.
on : str, list of str, or array-like, optional
    Column or index level name(s) in the caller to join on the index
    in `other`, otherwise joins index-on-index. If multiple
    values given, the `other` DataFrame must have a MultiIndex. Can
    pass an array as the join key if it is not already contained in
    the calling DataFrame. Like an Excel VLOOKUP operation.
how : {'left', 'right', 'outer', 'inner'}, default 'left'
    How to handle the operation of the two objects.

    * left: use calling frame's index (or column if on is specified)
    * right: use `other`'s index.
    * outer: form union of calling frame's index (or column if on is
      specified) with `other`'s index, and sort it.
      lexicographically.
    * inner: form intersection of calling frame's index (or column if
      on is specified) with `other`'s index, preserving the order
      of the calling's one.
lsuffix : str, default ''
    Suffix to use from left frame's overlapping columns.
rsuffix : str, default ''
    Suffix to use from right frame's overlapping columns.
sort : bool, default False
    Order result DataFrame lexicographically by the join key. If False,
    the order of the join key depends on the join type (how keyword).

Returns
-------
DataFrame
    A dataframe containing columns from both the caller and `other`.

See Also
--------
DataFrame.merge : For column(s)-on-columns(s) operations.

Notes
-----
Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
passing a list of `DataFrame` objects.

Support for specifying index levels as the `on` parameter was added
in version 0.23.0.

Examples
--------
>>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
...                    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})

>>> df
  key   A
0  K0  A0
1  K1  A1
2  K2  A2
3  K3  A3
4  K4  A4
5  K5  A5

>>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
...                       'B': ['B0', 'B1', 'B2']})

>>> other
  key   B
0  K0  B0
1  K1  B1
2  K2  B2

Join DataFrames using their indexes.

>>> df.join(other, lsuffix='_caller', rsuffix='_other')
  key_caller   A key_other    B
0         K0  A0        K0   B0
1         K1  A1        K1   B1
2         K2  A2        K2   B2
3         K3  A3       NaN  NaN
4         K4  A4       NaN  NaN
5         K5  A5       NaN  NaN

If we want to join using the key columns, we need to set key to be
the index in both `df` and `other`. The joined DataFrame will have
key as its index.

>>> df.set_index('key').join(other.set_index('key'))
      A    B
key
K0   A0   B0
K1   A1   B1
K2   A2   B2
K3   A3  NaN
K4   A4  NaN
K5   A5  NaN

Another option to join using the key columns is to use the `on`
parameter. DataFrame.join always uses `other`'s index but we can use
any column in `df`. This method preserves the original DataFrame's
index in the result.

>>> df.join(other.set_index('key'), on='key')
  key   A    B
0  K0  A0   B0
1  K1  A1   B1
2  K2  A2   B2
3  K3  A3  NaN
4  K4  A4  NaN
5  K5  A5  NaN
File:      c:\users\user\anaconda3\lib\site-packages\pandas\core\frame.py
Type:      method
# Replace each categorical column with its dummy (indicator) columns.
for column in categorial_cols:
    # Name every indicator "<column>#<value>" so its source column stays visible.
    indicators = pd.get_dummies(data_cp[column]).add_prefix(f"{column}#")
    # Remove the original column, then attach the indicators via an index join.
    data_cp = data_cp.drop(column, axis=1).join(indicators)
data_cp
playoffs shot_distance last_5_sec_in_period home_away shot_zone_basic#Above the Break 3 shot_zone_basic#Backcourt shot_zone_basic#In The Paint (Non-RA) shot_zone_basic#Left Corner 3 shot_zone_basic#Mid-Range shot_zone_basic#Restricted Area shot_zone_basic#Right Corner 3 season#1996-97 season#1997-98 season#1998-99 season#1999-00 season#2000-01 season#2001-02 season#2002-03 season#2003-04 season#2004-05 season#2005-06 season#2006-07 season#2007-08 season#2008-09 season#2009-10 season#2010-11 season#2011-12 season#2012-13 season#2013-14 season#2014-15 season#2015-16 shot_zone_area#Back Court(BC) shot_zone_area#Center(C) shot_zone_area#Left Side Center(LC) shot_zone_area#Left Side(L) shot_zone_area#Right Side Center(RC) shot_zone_area#Right Side(R) game_year#1996 game_year#1997 game_year#1998 game_year#1999 game_year#2000 game_year#2001 game_year#2002 game_year#2003 game_year#2004 game_year#2005 game_year#2006 game_year#2007 game_year#2008 game_year#2009 game_year#2010 game_year#2011 game_year#2012 game_year#2013 game_year#2014 game_year#2015 game_year#2016 shot_type#2PT Field Goal shot_type#3PT Field Goal game_month#1 game_month#2 game_month#3 game_month#4 game_month#5 game_month#6 game_month#10 game_month#11 game_month#12 loc_x#(-250.498, -230.08] loc_x#(-230.08, -210.16] loc_x#(-210.16, -190.24] loc_x#(-190.24, -170.32] loc_x#(-170.32, -150.4] loc_x#(-150.4, -130.48] loc_x#(-130.48, -110.56] loc_x#(-110.56, -90.64] loc_x#(-90.64, -70.72] loc_x#(-70.72, -50.8] loc_x#(-50.8, -30.88] loc_x#(-30.88, -10.96] loc_x#(-10.96, 8.96] loc_x#(8.96, 28.88] loc_x#(28.88, 48.8] loc_x#(48.8, 68.72] loc_x#(68.72, 88.64] loc_x#(88.64, 108.56] loc_x#(108.56, 128.48] loc_x#(128.48, 148.4] loc_x#(148.4, 168.32] loc_x#(168.32, 188.24] loc_x#(188.24, 208.16] loc_x#(208.16, 228.08] loc_x#(228.08, 248.0] combined_shot_type#Bank Shot combined_shot_type#Dunk combined_shot_type#Hook Shot combined_shot_type#Jump Shot combined_shot_type#Layup combined_shot_type#Tip Shot loc_y#(-44.835, -10.6] 
loc_y#(-10.6, 22.8] loc_y#(22.8, 56.2] loc_y#(56.2, 89.6] loc_y#(89.6, 123.0] loc_y#(123.0, 156.4] loc_y#(156.4, 189.8] loc_y#(189.8, 223.2] loc_y#(223.2, 256.6] loc_y#(256.6, 290.0] loc_y#(290.0, 323.4] loc_y#(323.4, 356.8] loc_y#(356.8, 390.2] loc_y#(390.2, 423.6] loc_y#(423.6, 457.0] loc_y#(457.0, 490.4] loc_y#(490.4, 523.8] loc_y#(523.8, 557.2] loc_y#(557.2, 590.6] loc_y#(590.6, 624.0] loc_y#(624.0, 657.4] loc_y#(657.4, 690.8] loc_y#(690.8, 724.2] loc_y#(724.2, 757.6] loc_y#(757.6, 791.0] period#1 period#2 period#3 period#4 period#5 period#6 period#7 opponent#ATL opponent#BKN opponent#BOS opponent#CHA opponent#CHI opponent#CLE opponent#DAL opponent#DEN opponent#DET opponent#GSW opponent#HOU opponent#IND opponent#LAC opponent#MEM opponent#MIA opponent#MIL opponent#MIN opponent#NJN opponent#NOH opponent#NOP opponent#NYK opponent#OKC opponent#ORL opponent#PHI opponent#PHX opponent#POR opponent#SAC opponent#SAS opponent#SEA opponent#TOR opponent#UTA opponent#VAN opponent#WAS action_type#Alley Oop Dunk Shot action_type#Alley Oop Layup shot action_type#Driving Dunk Shot action_type#Driving Finger Roll Layup Shot action_type#Driving Finger Roll Shot action_type#Driving Jump shot action_type#Driving Layup Shot action_type#Driving Reverse Layup Shot action_type#Driving Slam Dunk Shot action_type#Dunk Shot action_type#Fadeaway Bank shot action_type#Fadeaway Jump Shot action_type#Finger Roll Layup Shot action_type#Finger Roll Shot action_type#Floating Jump shot action_type#Follow Up Dunk Shot action_type#Hook Shot action_type#Jump Bank Shot action_type#Jump Hook Shot action_type#Jump Shot action_type#Layup Shot action_type#Other action_type#Pullup Jump shot action_type#Putback Layup Shot action_type#Reverse Dunk Shot action_type#Reverse Layup Shot action_type#Reverse Slam Dunk Shot action_type#Running Bank shot action_type#Running Dunk Shot action_type#Running Hook Shot action_type#Running Jump Shot action_type#Running Layup Shot action_type#Slam Dunk Shot 
action_type#Step Back Jump shot action_type#Tip Shot action_type#Turnaround Bank shot action_type#Turnaround Fadeaway shot action_type#Turnaround Jump Shot shot_zone_range#16-24 ft. shot_zone_range#24+ ft. shot_zone_range#8-16 ft. shot_zone_range#Back Court Shot shot_zone_range#Less Than 8 ft.
shot_id
1 0 18 False 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
2 0 15 False 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
3 0 16 False 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
4 0 22 False 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
5 0 0 False 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
30693 1 4 False 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
30694 1 0 False 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
30695 1 21 False 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
30696 1 26 False 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
30697 1 7 False 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1

30697 rows × 208 columns

중요 컬럼을 뽑아내기

# Flag the rows whose label is missing
unknown_mask = data.shot_made_flag.isnull()
# Show the first encoded row
data_cp.iloc[:1]
playoffs shot_distance last_5_sec_in_period home_away shot_zone_basic#Above the Break 3 shot_zone_basic#Backcourt shot_zone_basic#In The Paint (Non-RA) shot_zone_basic#Left Corner 3 shot_zone_basic#Mid-Range shot_zone_basic#Restricted Area shot_zone_basic#Right Corner 3 season#1996-97 season#1997-98 season#1998-99 season#1999-00 season#2000-01 season#2001-02 season#2002-03 season#2003-04 season#2004-05 season#2005-06 season#2006-07 season#2007-08 season#2008-09 season#2009-10 season#2010-11 season#2011-12 season#2012-13 season#2013-14 season#2014-15 season#2015-16 shot_zone_area#Back Court(BC) shot_zone_area#Center(C) shot_zone_area#Left Side Center(LC) shot_zone_area#Left Side(L) shot_zone_area#Right Side Center(RC) shot_zone_area#Right Side(R) game_year#1996 game_year#1997 game_year#1998 game_year#1999 game_year#2000 game_year#2001 game_year#2002 game_year#2003 game_year#2004 game_year#2005 game_year#2006 game_year#2007 game_year#2008 game_year#2009 game_year#2010 game_year#2011 game_year#2012 game_year#2013 game_year#2014 game_year#2015 game_year#2016 shot_type#2PT Field Goal shot_type#3PT Field Goal game_month#1 game_month#2 game_month#3 game_month#4 game_month#5 game_month#6 game_month#10 game_month#11 game_month#12 loc_x#(-250.498, -230.08] loc_x#(-230.08, -210.16] loc_x#(-210.16, -190.24] loc_x#(-190.24, -170.32] loc_x#(-170.32, -150.4] loc_x#(-150.4, -130.48] loc_x#(-130.48, -110.56] loc_x#(-110.56, -90.64] loc_x#(-90.64, -70.72] loc_x#(-70.72, -50.8] loc_x#(-50.8, -30.88] loc_x#(-30.88, -10.96] loc_x#(-10.96, 8.96] loc_x#(8.96, 28.88] loc_x#(28.88, 48.8] loc_x#(48.8, 68.72] loc_x#(68.72, 88.64] loc_x#(88.64, 108.56] loc_x#(108.56, 128.48] loc_x#(128.48, 148.4] loc_x#(148.4, 168.32] loc_x#(168.32, 188.24] loc_x#(188.24, 208.16] loc_x#(208.16, 228.08] loc_x#(228.08, 248.0] combined_shot_type#Bank Shot combined_shot_type#Dunk combined_shot_type#Hook Shot combined_shot_type#Jump Shot combined_shot_type#Layup combined_shot_type#Tip Shot loc_y#(-44.835, -10.6] 
loc_y#(-10.6, 22.8] loc_y#(22.8, 56.2] loc_y#(56.2, 89.6] loc_y#(89.6, 123.0] loc_y#(123.0, 156.4] loc_y#(156.4, 189.8] loc_y#(189.8, 223.2] loc_y#(223.2, 256.6] loc_y#(256.6, 290.0] loc_y#(290.0, 323.4] loc_y#(323.4, 356.8] loc_y#(356.8, 390.2] loc_y#(390.2, 423.6] loc_y#(423.6, 457.0] loc_y#(457.0, 490.4] loc_y#(490.4, 523.8] loc_y#(523.8, 557.2] loc_y#(557.2, 590.6] loc_y#(590.6, 624.0] loc_y#(624.0, 657.4] loc_y#(657.4, 690.8] loc_y#(690.8, 724.2] loc_y#(724.2, 757.6] loc_y#(757.6, 791.0] period#1 period#2 period#3 period#4 period#5 period#6 period#7 opponent#ATL opponent#BKN opponent#BOS opponent#CHA opponent#CHI opponent#CLE opponent#DAL opponent#DEN opponent#DET opponent#GSW opponent#HOU opponent#IND opponent#LAC opponent#MEM opponent#MIA opponent#MIL opponent#MIN opponent#NJN opponent#NOH opponent#NOP opponent#NYK opponent#OKC opponent#ORL opponent#PHI opponent#PHX opponent#POR opponent#SAC opponent#SAS opponent#SEA opponent#TOR opponent#UTA opponent#VAN opponent#WAS action_type#Alley Oop Dunk Shot action_type#Alley Oop Layup shot action_type#Driving Dunk Shot action_type#Driving Finger Roll Layup Shot action_type#Driving Finger Roll Shot action_type#Driving Jump shot action_type#Driving Layup Shot action_type#Driving Reverse Layup Shot action_type#Driving Slam Dunk Shot action_type#Dunk Shot action_type#Fadeaway Bank shot action_type#Fadeaway Jump Shot action_type#Finger Roll Layup Shot action_type#Finger Roll Shot action_type#Floating Jump shot action_type#Follow Up Dunk Shot action_type#Hook Shot action_type#Jump Bank Shot action_type#Jump Hook Shot action_type#Jump Shot action_type#Layup Shot action_type#Other action_type#Pullup Jump shot action_type#Putback Layup Shot action_type#Reverse Dunk Shot action_type#Reverse Layup Shot action_type#Reverse Slam Dunk Shot action_type#Running Bank shot action_type#Running Dunk Shot action_type#Running Hook Shot action_type#Running Jump Shot action_type#Running Layup Shot action_type#Slam Dunk Shot 
action_type#Step Back Jump shot action_type#Tip Shot action_type#Turnaround Bank shot action_type#Turnaround Fadeaway shot action_type#Turnaround Jump Shot shot_zone_range#16-24 ft. shot_zone_range#24+ ft. shot_zone_range#8-16 ft. shot_zone_range#Back Court Shot shot_zone_range#Less Than 8 ft.
shot_id
1 0 18 False 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
# Split off the rows used for the submission (i.e. the shots that must be predicted)
data_submit = data_cp[unknown_mask]
# Labels of the remaining rows. A boolean mask is inverted with `~`; unary minus
# on booleans is rejected by NumPy and is not the documented pandas idiom.
Y = target[~unknown_mask]
# The feature matrix still holds every row at this point
X = data_cp
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier,VotingClassifier, RandomForestClassifier, AdaBoostClassifier

분산을 이용

# Drop the features whose variance is low
threshold = 0.90
# A 0/1 column that takes the same value in 90% of the rows has variance 0.9 * 0.1
min_variance = threshold * (1 - threshold)

vt = VarianceThreshold()
vt.fit(X)

# Names of the columns whose variance exceeds the cut-off
high_variance = vt.variances_ > min_variance
feature_var_threshold = data_cp.columns[high_variance]
feature_var_threshold
Index(['playoffs', 'shot_distance', 'home_away',
       'shot_zone_basic#Above the Break 3',
       'shot_zone_basic#In The Paint (Non-RA)', 'shot_zone_basic#Mid-Range',
       'shot_zone_basic#Restricted Area', 'shot_zone_area#Center(C)',
       'shot_zone_area#Left Side Center(LC)', 'shot_zone_area#Left Side(L)',
       'shot_zone_area#Right Side Center(RC)', 'shot_zone_area#Right Side(R)',
       'shot_type#2PT Field Goal', 'shot_type#3PT Field Goal', 'game_month#1',
       'game_month#2', 'game_month#3', 'game_month#4', 'game_month#11',
       'game_month#12', 'loc_x#(-10.96, 8.96]', 'combined_shot_type#Jump Shot',
       'combined_shot_type#Layup', 'loc_y#(-10.6, 22.8]', 'loc_y#(22.8, 56.2]',
       'loc_y#(123.0, 156.4]', 'period#1', 'period#2', 'period#3', 'period#4',
       'action_type#Jump Shot', 'shot_zone_range#16-24 ft.',
       'shot_zone_range#24+ ft.', 'shot_zone_range#8-16 ft.',
       'shot_zone_range#Less Than 8 ft.'],
      dtype='object')

RandomForestClassifier 사용

# Keep only the rows with a known label for supervised training.
# Selecting from data_cp instead of re-filtering X keeps this cell re-runnable,
# and `~` is the supported way to invert a boolean mask.
X = data_cp[~unknown_mask]

model = RandomForestClassifier()

model.fit(X, Y)

# Rank the columns by the fitted forest's feature_importances_ and keep the top 20 names
feature_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['importance'])
feature_imp = feature_imp.sort_values("importance", ascending=False).head(20).index
feature_imp
Index(['shot_distance', 'action_type#Jump Shot', 'home_away', 'period#3',
       'period#1', 'period#2', 'period#4', 'action_type#Layup Shot',
       'game_month#3', 'game_month#1', 'game_month#4', 'game_month#12',
       'game_month#2', 'combined_shot_type#Dunk', 'game_month#11', 'playoffs',
       'loc_x#(-10.96, 8.96]', 'action_type#Driving Layup Shot',
       'opponent#SAS', 'combined_shot_type#Jump Shot'],
      dtype='object')
np.hstack?
Stack arrays in sequence horizontally (column wise).

This is equivalent to concatenation along the second axis, except for 1-D
arrays where it concatenates along the first axis. Rebuilds arrays divided
by `hsplit`.

This function makes most sense for arrays with up to 3 dimensions. For
instance, for pixel-data with a height (first axis), width (second axis),
and r/g/b channels (third axis). The functions `concatenate`, `stack` and
`block` provide more general stacking and concatenation operations.

Parameters
----------
tup : sequence of ndarrays
    The arrays must have the same shape along all but the second axis,
    except 1-D arrays which can be any length.

Returns
-------
stacked : ndarray
    The array formed by stacking the given arrays.

See Also
--------
stack : Join a sequence of arrays along a new axis.
vstack : Stack arrays in sequence vertically (row wise).
dstack : Stack arrays in sequence depth wise (along third axis).
concatenate : Join a sequence of arrays along an existing axis.
hsplit : Split array along second axis.
block : Assemble arrays from blocks.

Examples
--------
>>> a = np.array((1,2,3))
>>> b = np.array((2,3,4))
>>> np.hstack((a,b))
array([1, 2, 3, 2, 3, 4])
>>> a = np.array([[1],[2],[3]])
>>> b = np.array([[2],[3],[4]])
>>> np.hstack((a,b))
array([[1, 2],
       [2, 3],
       [3, 4]])
File:      c:\users\user\anaconda3\lib\site-packages\numpy\core\shape_base.py
Type:      function
# Merge the variance-based and forest-based selections into the final feature list

# np.unique sorts the names and removes the ones picked by both methods
features = np.unique(np.hstack([feature_var_threshold, feature_imp]))

print('final feature')
for feature_name in features:
    print(f'\t-{feature_name}')
final feature
	-action_type#Driving Layup Shot
	-action_type#Jump Shot
	-action_type#Layup Shot
	-combined_shot_type#Dunk
	-combined_shot_type#Jump Shot
	-combined_shot_type#Layup
	-game_month#1
	-game_month#11
	-game_month#12
	-game_month#2
	-game_month#3
	-game_month#4
	-home_away
	-loc_x#(-10.96, 8.96]
	-loc_y#(-10.6, 22.8]
	-loc_y#(123.0, 156.4]
	-loc_y#(22.8, 56.2]
	-opponent#SAS
	-period#1
	-period#2
	-period#3
	-period#4
	-playoffs
	-shot_distance
	-shot_type#2PT Field Goal
	-shot_type#3PT Field Goal
	-shot_zone_area#Center(C)
	-shot_zone_area#Left Side Center(LC)
	-shot_zone_area#Left Side(L)
	-shot_zone_area#Right Side Center(RC)
	-shot_zone_area#Right Side(R)
	-shot_zone_basic#Above the Break 3
	-shot_zone_basic#In The Paint (Non-RA)
	-shot_zone_basic#Mid-Range
	-shot_zone_basic#Restricted Area
	-shot_zone_range#16-24 ft.
	-shot_zone_range#24+ ft.
	-shot_zone_range#8-16 ft.
	-shot_zone_range#Less Than 8 ft.
# Restrict every dataset to the selected feature columns
X = X.loc[:, features]
data_submit = data_submit.loc[:, features]
data_cp = data_cp.loc[:, features]

# Report the resulting shapes
print(f'Clean dataset shape: {data_cp.shape}')
print(f'Subbmitable dataset shape:{data_submit.shape}')
print(f'Train features shape:{X.shape}')
print(f'Target label shape:{Y.shape}')
Clean dataset shape: (30697, 39)
Subbmitable dataset shape:(5000, 39)
Train features shape:(25697, 39)
Target label shape:(25697,)

PCA 방법

# Fit PCA on the training features and chart the variance ratio of each component
components = 8
pca = PCA(n_components=components)
pca.fit(X)

component_ids = np.arange(components) + 1
pca_variance_explained_df = pd.DataFrame(
    {"components": component_ids, "variance_explained": pca.explained_variance_ratio_}
)

ax = sns.barplot(data=pca_variance_explained_df, x='components', y='variance_explained')
ax.set_title("PCA-Variance explained")
plt.show()

농구_68_0

모델링 및 평가하기

# Preview the first three rows of the training feature matrix
X.head(3)
action_type#Driving Layup Shot action_type#Jump Shot action_type#Layup Shot combined_shot_type#Dunk combined_shot_type#Jump Shot combined_shot_type#Layup game_month#1 game_month#11 game_month#12 game_month#2 game_month#3 game_month#4 home_away loc_x#(-10.96, 8.96] loc_y#(-10.6, 22.8] loc_y#(123.0, 156.4] loc_y#(22.8, 56.2] opponent#SAS period#1 period#2 period#3 period#4 playoffs shot_distance shot_type#2PT Field Goal shot_type#3PT Field Goal shot_zone_area#Center(C) shot_zone_area#Left Side Center(LC) shot_zone_area#Left Side(L) shot_zone_area#Right Side Center(RC) shot_zone_area#Right Side(R) shot_zone_basic#Above the Break 3 shot_zone_basic#In The Paint (Non-RA) shot_zone_basic#Mid-Range shot_zone_basic#Restricted Area shot_zone_range#16-24 ft. shot_zone_range#24+ ft. shot_zone_range#8-16 ft. shot_zone_range#Less Than 8 ft.
shot_id
2 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 15 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0
3 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 16 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0
4 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 22 1 0 0 0 0 1 0 0 0 1 0 1 0 0 0
# (rows, columns) of the training feature matrix
X.shape
(25697, 39)
# Preview the first three training labels
Y.head(3)
shot_id
2    0.0
3    1.0
4    0.0
Name: shot_made_flag, dtype: category
Categories (2, float64): [0.0, 1.0]
# Shape of the training label vector
Y.shape
(25697,)
## Configuration shared by the algorithms and helper calls below

seed = 7
processors = 1
num_folds = 5
num_instance = len(X)
scoring = 'neg_log_loss'

# No random_state here: it only has an effect together with shuffle=True, and
# scikit-learn >= 0.24 raises a ValueError when it is set on an unshuffled
# splitter. The unshuffled folds are deterministic without it.
k_fold = KFold(n_splits=num_folds)

# Baseline classifiers to compare
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    # kNN has to be told how many neighbours to look at
    ('kNN', KNeighborsClassifier(n_neighbors=5)),
    ('CART', DecisionTreeClassifier()),
]

results = []
names = []

for name, model in models:
    # One negative log-loss score per fold
    cv_results = cross_val_score(model, X, Y, cv=k_fold, scoring=scoring, n_jobs=processors)
    results.append(cv_results)
    names.append(name)

    print('{}: {} +/- {}'.format(name, cv_results.mean(), cv_results.std()))
LR: -0.6188193218725694 +/- 0.004363985247902272
LDA: -0.6200222598834624 +/- 0.005268337905436931
kNN: -2.0409667268360083 +/- 0.1992975377549342
CART: -9.54050070397854 +/- 0.760930167111565
import sklearn
# List the scoring names that can be passed as `scoring=`.
# NOTE(review): SCORERS appears to have been removed in scikit-learn 1.3; on newer
# versions sklearn.metrics.get_scorer_names() is the replacement — confirm against
# the installed version.
sklearn.metrics.SCORERS.keys()
dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

Ensemble(앙상블) 모형

  • 다양한 모형의 예측 결과를 결합하여 사용하는 모형(여러 알고리즘을 결합하여 사용)
  • 각각의 알고리즘은 상호 독립적이어야 하고, 각 모형의 오분류율은 50% 미만이어야 한다(즉, 무작위 추측보다는 정확해야 한다)

Bagging

  • bootstrap Aggregation
    • 통계적 분류와 회귀 분석에서 사용되는 기계학습 알고리즘의 안정성과 정확도를 향상시키기 위한 앙상블 학습법의 알고리즘
    • 분산을 줄이고, overfitting을 피하도록 해준다.
    • 주로, Decision Tree와 RandomForest에 적용

Decision Tree

# Bag 100 decision trees and score the ensemble with cross-validation
cart = DecisionTreeClassifier()
num_trees = 100

# The base learner is passed positionally because its keyword was renamed
# (base_estimator -> estimator in scikit-learn 1.2); the first positional slot
# works with both the old and the new name.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)

results = cross_val_score(model, X, Y, cv=k_fold, scoring=scoring, n_jobs=processors)
print('{} +/- {}'.format(results.mean(), results.std()))
-0.9178356241773005 +/- 0.02955286001580283

RandomForest

  • Bagging 종류 알고리즘 중 하나로 성능이 좋은 편
# Score a random forest with cross-validation
num_trees = 100
num_features = 10

# Seeded like the bagging model so that the reported score is reproducible
model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features, random_state=seed)

results = cross_val_score(model, X, Y, cv=k_fold, scoring=scoring, n_jobs=processors)
print('{} +/- {}'.format(results.mean(), results.std()))
-0.9050138346040715 +/- 0.036967097078068625

Voting

# Train several models and combine their predictions
estimators = [
    ('lr', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
]

# Soft voting with weights 6 (LR) and 4 (LDA)
ensemble = VotingClassifier(estimators, voting='soft', weights=[6, 4])
# Score on the same folds as the other models so the numbers are comparable
results = cross_val_score(ensemble, X, Y, cv=k_fold, scoring=scoring, n_jobs=processors)
print('{} +/- {}'.format(results.mean(), results.std()))
-0.6195021443080785 +/- 0.007836031395028803
# Fit the chosen model on all labelled rows and write the submission file
model = ensemble
import datetime
model.fit(X, Y)
preds = model.predict_proba(data_submit)

# predict_proba columns follow model.classes_. shot_made_flag must hold the
# probability of label 1 (shot made), not column 0 (shot missed).
made_column = list(model.classes_).index(1)

submission = pd.DataFrame()
submission['shot_id'] = data_submit.index
submission['shot_made_flag'] = preds[:, made_column]

submission.to_csv('sub_now.csv', index=False)

점수 높여보기

# Build the evaluation helper up front; first prepare a small frame to try it on
test_data = train[['loc_x', 'loc_y', 'shot_made_flag']].dropna()

def test_it(data):
    """Cross-validate a default random forest on *data*.

    Parameters
    ----------
    data : DataFrame-like
        Feature columns plus a ``shot_made_flag`` label column.

    Returns
    -------
    The 10-fold ``neg_log_loss`` scores returned by ``cross_val_score``.
    """
    clf = RandomForestClassifier(n_jobs=-1)
    # `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts
    inputs = data.drop(columns='shot_made_flag')
    return cross_val_score(clf, inputs, data.shot_made_flag, scoring='neg_log_loss', cv=10)
test_it(test_data).mean()
-0.8847544750995745
# Score a model that only sees loc_y
data = train[['loc_y', 'shot_made_flag']].dropna()

# Features have to combine well with each other for the model to learn well
test_it(data).mean()
-0.6905886020233778
# Score a model that only sees shot_distance
data = train[['shot_distance', 'shot_made_flag']].dropna()

test_it(data).mean()
-0.6713330908479926
# Score the combination of shot_distance and loc_y
data = train[['shot_distance', 'loc_y', 'shot_made_flag']].dropna()

test_it(data).mean()
-0.984015188449691
# Score the combination of seconds_remaining and shot_distance
data = train[['seconds_remaining', 'shot_distance', 'shot_made_flag']].dropna()

test_it(data).mean()
-0.8162567982043301
# Print the distinct season labels
print(train.season.unique())
[2000-01, 2001-02, 2002-03, 2003-04, 2004-05, ..., 2015-16, 1996-97, 1997-98, 1998-99, 1999-00]
Length: 20
Categories (20, object): [2000-01, 2001-02, 2002-03, 2003-04, ..., 1996-97, 1997-98, 1998-99, 1999-00]
# Parse the part before the '-' of each season label (e.g. '2000-01') as an integer.
# Stored under one correctly spelled column instead of a string column plus a
# misspelled 'seacon_start_year' integer copy.
train['season_start_year'] = train.season.str.split('-').str[0].astype(int)
data = train[['season_start_year', 'shot_distance', 'shot_made_flag']].dropna()

test_it(data).mean()
-1.2204021155277265
# Give every distinct action_type an integer code, numbered in the order unique() returns them
action_labels = train.action_type.unique()
action_map = dict(zip(action_labels, range(len(action_labels))))
train['action_type_enumerated'] = train['action_type'].map(action_map)
train['action_type_enumerated']
shot_id
2        0
3        0
4        0
5        1
6        0
        ..
30692    6
30693    0
30695    3
30696    0
30697    0
Name: action_type_enumerated, Length: 25697, dtype: int64
# Score the combination of the encoded action type and shot_distance
data = train[['action_type_enumerated', 'shot_distance', 'shot_made_flag']].dropna()

test_it(data).mean()
-0.6497893336065241
# Compare 10-fold CV log loss across different numbers of trees
data = train[['action_type_enumerated', 'shot_distance', 'shot_made_flag']].dropna()
# Loop-invariant inputs; `columns=` replaces the positional axis argument that
# pandas 2.0 no longer accepts
inputs = data.drop(columns=['shot_made_flag'])
labels = data.shot_made_flag
n_folds = 10
estimators, scores = list(range(1, 100, 5)), []

for n_trees in estimators:
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=n_trees, random_state=2016)
    fold_scores = cross_val_score(clf, inputs, labels, scoring='neg_log_loss', cv=n_folds)
    scores.append(fold_scores)

# Repeat each setting once per fold so it lines up with the flattened scores
x = [n_trees for n_trees in estimators for _ in range(n_folds)]
# x/y must be passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.boxplot(x=x, y=np.array(scores).flatten())
<matplotlib.axes._subplots.AxesSubplot at 0x179001e5688>

농구_102_1

# Compare 10-fold CV log loss across different maximum tree depths (70 trees each)
depth, scores = list(range(1, 20, 1)), []
# Loop-invariant inputs; `columns=` replaces the positional axis argument that
# pandas 2.0 no longer accepts
inputs = data.drop(columns=['shot_made_flag'])
labels = data.shot_made_flag
n_folds = 10

for max_depth in depth:
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=70, max_depth=max_depth, random_state=2016)
    fold_scores = cross_val_score(clf, inputs, labels, scoring='neg_log_loss', cv=n_folds)
    scores.append(fold_scores)

# Repeat each setting once per fold so it lines up with the flattened scores
x = [max_depth for max_depth in depth for _ in range(n_folds)]
# x/y must be passed by keyword: seaborn >= 0.12 rejects them as positional arguments
sns.boxplot(x=x, y=np.array(scores).flatten())
<matplotlib.axes._subplots.AxesSubplot at 0x179019cf908>

농구_103_1

clf = RandomForestClassifier(n_jobs=-1, n_estimators=70, max_depth=7, random_state=2016) # a more powerful classifier

# NOTE(review): this cell assumes `train` still contains the unlabelled rows and
# has 'shot_id' and 'away' columns — verify against where `train` is built.
is_unlabelled = train.shot_made_flag.isnull()
train_data = train.loc[~is_unlabelled, ['action_type_enumerated',
                                        'shot_distance', 'shot_made_flag', 'away']]
# .copy() so the imputation below modifies an independent frame
test = train.loc[is_unlabelled, ['action_type_enumerated',
                                 'shot_distance', 'shot_id', 'away']].copy()
# Impute missing action codes with the most frequent one.
# Assigning the result back replaces the chained `fillna(..., inplace=True)`,
# which is not guaranteed to update the frame.
mode = test.action_type_enumerated.mode()[0]
test['action_type_enumerated'] = test['action_type_enumerated'].fillna(mode)

# Train and predict (`columns=` replaces the positional axis argument that
# pandas 2.0 no longer accepts)
clf.fit(train_data.drop(columns='shot_made_flag'), train_data.shot_made_flag)
predictions = clf.predict_proba(test.drop(columns='shot_id'))

import datetime
submission = pd.DataFrame({'shot_id': test.shot_id,
                           'shot_made_flag': predictions[:, 1]})
# Compact timestamp: the default str(datetime) contains ':' and spaces, which
# are not valid in Windows file names
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
submission[['shot_id', 'shot_made_flag']].to_csv('submission{}.csv'.format(timestamp), index=False)

Leave a comment