import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
# Seed Python's stdlib RNG for reproducibility.
# NOTE(review): the sampling below uses pandas/numpy random generators,
# which this seed does not affect — consider np.random.seed(42) too. TODO confirm.
random.seed(42)
# Load the A/B-test event log (one row per user page view).
df = pd.read_csv('ab_data.csv')
df.head()
def proportions_ztest(count, nobs, value=None, alternative='two-sided',
                      prop_var=False):
    """z-test for one or two sample proportions.

    Parameters
    ----------
    count : int or array-like
        Number of successes; for a two-sample test, a length-2 sequence.
    nobs : int or array-like
        Number of observations; a scalar is broadcast to match ``count``.
    value : float, optional
        Hypothesized proportion (1-sample) or difference in proportions
        (2-sample). Required for a 1-sample test; defaults to 0 for 2-sample.
    alternative : str
        Passed to ``_zstat_generic2``: 'two-sided', 'larger', or 'smaller'
        (including their abbreviations).
    prop_var : float or False
        If truthy, use this proportion for the variance instead of the
        pooled estimate.

    Returns
    -------
    (zstat, pvalue) : tuple of floats

    Raises
    ------
    ValueError
        If ``value`` is missing for a 1-sample test.
    NotImplementedError
        For more than two samples.
    """
    count = np.asarray(count)
    nobs = np.asarray(nobs)

    # Broadcast a scalar nobs across all samples (e.g. equal group sizes).
    if nobs.size == 1:
        nobs = nobs * np.ones_like(count)

    prop = count * 1. / nobs
    k_sample = np.size(prop)

    if value is None:
        if k_sample == 1:
            raise ValueError('value must be provided for a 1-sample test')
        value = 0

    if k_sample == 1:
        diff = prop - value
    elif k_sample == 2:
        # Difference of sample proportions minus the hypothesized difference.
        diff = prop[0] - prop[1] - value
    else:
        msg = 'more than two samples are not implemented yet'
        raise NotImplementedError(msg)

    # Pooled proportion under H0, and the 1/n factor for the variance.
    p_pooled = np.sum(count) * 1. / np.sum(nobs)
    nobs_fact = np.sum(1. / nobs)
    if prop_var:
        # Caller supplied a fixed proportion to use for the variance.
        p_pooled = prop_var
    var_ = p_pooled * (1 - p_pooled) * nobs_fact
    std_diff = np.sqrt(var_)

    return _zstat_generic2(diff, std_diff, alternative)
def _zstat_generic2(value, std_diff, alternative):
'''generic (normal) z-test to save typing
can be used as ztest based on summary statistics
'''
zstat = value / std_diff
print('zstat: ' + str(zstat))
if alternative in ['two-sided', '2-sided', '2s']:
pvalue = stats.norm.sf(np.abs(zstat))*2
print('alternative is two-sided.')
print('pvalue: ' + str(pvalue))
elif alternative in ['larger', 'l']:
pvalue = stats.norm.sf(zstat)
elif alternative in ['smaller', 's']:
pvalue = stats.norm.cdf(zstat)
else:
raise ValueError('invalid alternative')
return zstat, pvalue
# Distinct values in the group and landing_page columns.
df['group'].unique()
df['landing_page'].unique()
# Row count vs. distinct users — a gap means duplicated user_ids.
len(df)
df['user_id'].nunique()
# Overall conversion rate across all rows.
sum(df['converted'] == 1) / len(df['converted'])
# Count rows where group and landing_page disagree in either direction.
treatment_wrong = sum((df['group'] == 'treatment') & (df['landing_page'] != 'new_page'))
control_wrong = sum((df['group'] == 'control') & (df['landing_page'] == 'new_page'))
total_wrong = treatment_wrong + control_wrong
total_wrong
# Check dtypes and non-null counts.
df.info()
No rows appear to be missing any values.
# Work on a second frame so the raw data can stay untouched.
# NOTE(review): `df2 = df` is an alias, not a copy — the drop() below also
# mutates `df`; use df.copy() if the original must be preserved. TODO confirm.
df2 = df
df2.info()
# Keep only rows where group and landing_page are consistent.
df2 = df2.loc[~((df2['group'] == 'treatment') & (df2['landing_page'] != 'new_page'))]
df2 = df2.loc[~((df2['group'] == 'control') & (df2['landing_page'] == 'new_page'))]
df2.shape
# Sanity check: zero remaining mismatched rows.
df2[((df2['group'] == 'treatment') == (df2['landing_page'] == 'new_page')) == False].shape[0]
df2['user_id'].nunique()
# Inspect the duplicated user_id (same expression repeated in two cells).
df2.loc[df2['user_id'].duplicated()]
df2.loc[df2['user_id'].duplicated()]
# Drop the duplicate row by its index label, then confirm no duplicates remain.
df2.drop(2893, inplace = True);
df2['user_id'].duplicated().value_counts()
# Overall conversion rate after cleaning.
conv_prob = sum(df2['converted'] == 1) / len(df2['converted'])
conv_prob
# Conversion rate within the control group.
control_group = df2.loc[df2['group'] == 'control']
control_conv_prob = sum(control_group['converted'] == 1) / len(control_group['converted'])
control_conv_prob
# Conversion rate within the treatment group.
treatment_group = df2.loc[df2['group'] == 'treatment']
treatment_conv_prob = sum(treatment_group['converted'] == 1) / len(treatment_group['converted'])
treatment_conv_prob
# Share of rows assigned to treatment.
len(treatment_group) / len(df2)
# Observed difference, computed as control minus treatment (old - new).
# NOTE(review): the simulated differences later are new - old; the sign
# conventions should match before comparing them. TODO confirm.
actual_prob_diff = control_conv_prob - treatment_conv_prob
actual_prob_diff
There appears to be a 0.00158 higher probability of a conversion from a control page than a conversion from a treatment page in this dataset.
$H_{0}: P_{new} - P_{old} \leq 0$
$H_{1}: P_{new} - P_{old} > 0$
$\alpha: 0.05$
conv_prob
# Sample sizes for the simulated new (treatment) and old (control) pages.
n_new = len(treatment_group)
n_new
n_old = len(control_group)
n_old
# One bootstrap sample of size n_new (with replacement) and its rate.
new_data_sample = treatment_group.sample(n_new, replace = True)
new_page_converted = new_data_sample['converted'].tolist()
p_new_conv_rate = sum(new_page_converted) / len(new_page_converted)
p_new_conv_rate
# NOTE(review): sampled WITHOUT replacement at full size, so this is just a
# permutation of control_group and always reproduces its exact conversion
# rate — replace=True was likely intended here too. TODO confirm.
old_data_sample = control_group.sample(n_old)
old_page_converted = old_data_sample['converted'].tolist()
p_old_conv_rate = sum(old_page_converted) / len(old_page_converted)
p_old_conv_rate
# Single simulated difference in conversion rates (new - old).
ef_diff = p_new_conv_rate - p_old_conv_rate
ef_diff
# Simulate the sampling distribution of the difference in conversion
# rates via 10,000 bootstrap replicates.
p_diffs = np.empty(0)
for x in range(10000):
    # NOTE(review): samples of size 1000 rather than n_new/n_old, which
    # inflates the spread relative to the observed statistic. TODO confirm.
    new_data_sample = treatment_group.sample(1000, replace = True)
    new_page_converted = new_data_sample['converted'].tolist()
    new_conv_prob = sum(new_page_converted) / len(new_page_converted)
    old_data_sample = control_group.sample(1000, replace = True)
    old_page_converted = old_data_sample['converted'].tolist()
    old_conv_prob = sum(old_page_converted) / len(old_page_converted)
    difference = new_conv_prob - old_conv_prob
    p_diffs = np.append(p_diffs, difference)
plt.hist(p_diffs)
# Proportion of simulated differences exceeding the observed difference
# (an empirical p-value).
# NOTE(review): p_diffs is new - old while actual_prob_diff is old - new;
# this comparison mixes the two sign conventions. TODO confirm.
greater = [i for i in p_diffs if i > actual_prob_diff]
print('len(greater): ' + str(len(greater)))
print('len(p_diffs): ' + str(len(p_diffs)))
greater_prop = len(greater) / len(p_diffs)
greater_prop
The p-value is the probability of observing the test statistic, or one more extreme in favor of the alternative, assuming the null hypothesis is true. If the p-value is smaller than $\alpha$, we reject the null hypothesis in favor of the alternative; if it is larger than $\alpha$, we fail to reject the null hypothesis.
import statsmodels.api as sm
# Summary counts for the two-proportion z-test.
convert_old = sum(control_group['converted'] == 1)
convert_new = sum(treatment_group['converted'] == 1)
# NOTE(review): these self-assignments are no-ops and can be removed.
n_old = n_old
n_new = n_new
# Difference in rates and in sample sizes, for reference.
conv_diff = (convert_old / n_old) - (convert_new / n_new)
num_diff = n_new - n_old
conv_diff, num_diff
# Two-proportion z-test; `alternative` is left at its 'two-sided' default.
# NOTE(review): H1 above is one-sided (P_new > P_old), so
# alternative='larger' would match the stated hypotheses. TODO confirm.
z_score, p_value = proportions_ztest([convert_new, convert_old], [n_new, n_old])
z_score, p_value
The p-value is greater than the alpha of 0.05, so we fail to reject the null hypothesis according to this.
This does not agree with the earlier simulation-based findings.
# Encode the page seen as a 0/1 dummy (1 = treatment/new page) and add an
# intercept column for the logit model.
df2['intercept'] = 1
df2['ab_page'] = 0
df2.loc[df2['group'] == 'treatment', 'ab_page'] = 1
df2.head(20)
# Logistic regression of conversion on the page dummy.
log_mod = sm.Logit(df2['converted'], df2[['intercept', 'ab_page']])
results = log_mod.fit()
results.summary()
P-value: 0.190 It appears to be rounded to the thousandth place.
# Bring in each user's country from a second file.
countries_df = pd.read_csv('countries.csv')
countries_df.head()
# NOTE(review): DataFrame.join aligns on the index, and countries_df is on a
# default positional index here — if countries.csv is keyed by user_id this
# needs countries_df.set_index('user_id') first to align rows. TODO confirm.
df3 = pd.DataFrame.join(df2, countries_df['country'])
df3.head()
df3['country'].value_counts()
# One-hot encode country; drop US to serve as the baseline category.
df3[['CA','UK', 'US']] = pd.get_dummies(df3['country'])
df3 = df3.drop('US', axis=1)
df3.head()
# Logit model with the country dummies added (US is the baseline).
log_mod2 = sm.Logit(df3['converted'], df3[['intercept', 'ab_page', 'CA', 'UK']])
results2 = log_mod2.fit()
results2.summary()
Given that the country p-values are higher than 0.05, country does not appear to have an impact on conversion.