Jupyter Snippet CB2nd 04_correlation
Jupyter Snippet CB2nd 04_correlation
7.4. Estimating the correlation between two variables with a contingency table and a chi-squared test
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
player = 'Roger Federer'
df = pd.read_csv('https://github.com/ipython-books/'
'cookbook-2nd-data/blob/master/'
'federer.csv?raw=true',
parse_dates=['start date'],
dayfirst=True)
print(f"Number of columns: {len(df.columns)}")
df[df.columns[:4]].tail()
Number of columns: 70
npoints = df['player1 total points total']
points = df['player1 total points won'] / npoints
aces = df['player1 aces'] / npoints
fig, ax = plt.subplots(1, 1)
ax.plot(points, aces, '.')
ax.set_xlabel('% of points won')
ax.set_ylabel('% of aces')
ax.set_xlim(0., 1.)
ax.set_ylim(0.)
df_bis = pd.DataFrame({'points': points,
'aces': aces}).dropna()
df_bis.tail()
df_bis.corr()
df_bis['result'] = (df_bis['points'] >
df_bis['points'].median())
df_bis['manyaces'] = (df_bis['aces'] >
df_bis['aces'].median())
pd.crosstab(df_bis['result'], df_bis['manyaces'])
st.chi2_contingency(_)
(2.780e+01, 1.338e-07, 1,
array([[ 257.250, 256.749],
[ 256.749, 256.250]]))