Analyze A/B Test Results

Matthew Unrue, Fall 2018

Udacity Data Analyst Nanodegree Project 3

Part I - Probability

In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
random.seed(42)
np.random.seed(42)  # pandas .sample() draws from NumPy's RNG, so seed it as well
In [2]:
df = pd.read_csv('ab_data.csv')
df.head()
Out[2]:
user_id timestamp group landing_page converted
0 851104 2017-01-21 22:11:48.556739 control old_page 0
1 804228 2017-01-12 08:01:45.159739 control old_page 0
2 661590 2017-01-11 16:55:06.154213 treatment new_page 0
3 853541 2017-01-08 18:28:03.143765 treatment new_page 0
4 864975 2017-01-21 01:52:26.210827 control old_page 1
In [3]:
def proportions_ztest(count, nobs, value=None, alternative='two-sided',
                      prop_var=False):
    '''Two-sample z-test for proportions, adapted from
    statsmodels.stats.proportion.proportions_ztest with print statements
    added to expose the intermediate values.'''

    count = np.asarray(count)
    nobs = np.asarray(nobs)

    if nobs.size == 1:
        nobs = nobs * np.ones_like(count)
        print('nobs.size == 1')

    prop = count * 1. / nobs
    print('prop: ' + str(prop))
    k_sample = np.size(prop)
    print('k_sample: ' + str(k_sample))
    if value is None:
        if k_sample == 1:
            raise ValueError('value must be provided for a 1-sample test')
        value = 0
    if k_sample == 1:
        diff = prop - value
        print('k_sample == 1 and diff: ' + str(diff))
    elif k_sample == 2:
        diff = prop[0] - prop[1] - value
        print('k_sample == 2 and diff: ' + str(diff))
    else:
        msg = 'more than two samples are not implemented yet'
        raise NotImplementedError(msg)

    p_pooled = np.sum(count) * 1. / np.sum(nobs)
    print('p_pooled: ' + str(p_pooled))

    nobs_fact = np.sum(1. / nobs)
    print('nobs_fact: ' + str(nobs_fact))
    print('prop_var: ' + str(prop_var))
    if prop_var:
        p_pooled = prop_var
    var_ = p_pooled * (1 - p_pooled) * nobs_fact
    print('var_: ' + str(var_))
    std_diff = np.sqrt(var_)
    print('std_diff: ' + str(std_diff))
    
    return _zstat_generic2(diff, std_diff, alternative)
In [4]:
def _zstat_generic2(value, std_diff, alternative):
    '''generic (normal) z-test to save typing

    can be used as ztest based on summary statistics
    '''
    zstat = value / std_diff
    print('zstat: ' + str(zstat))
    if alternative in ['two-sided', '2-sided', '2s']:
        pvalue = stats.norm.sf(np.abs(zstat))*2
        print('alternative is two-sided.')
        print('pvalue: ' + str(pvalue))
        
    elif alternative in ['larger', 'l']:
        pvalue = stats.norm.sf(zstat)
    elif alternative in ['smaller', 's']:
        pvalue = stats.norm.cdf(zstat)
    else:
        raise ValueError('invalid alternative')
    return zstat, pvalue
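These two functions are copied from statsmodels' proportions_ztest and its internal helper, with print statements added so that each intermediate quantity can be inspected. As a sanity check, the library's own version should return the same statistic and p-value; a minimal sketch (the counts and sample sizes below are illustrative placeholders, not project data):

# Sketch: compare against the statsmodels implementation directly.
from statsmodels.stats.proportion import proportions_ztest as sm_ztest

z_check, p_check = sm_ztest(count=[120, 100], nobs=[1000, 1000])
print(z_check, p_check)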
In [5]:
df['group'].unique()
Out[5]:
array(['control', 'treatment'], dtype=object)
In [6]:
df['landing_page'].unique()
Out[6]:
array(['old_page', 'new_page'], dtype=object)
In [7]:
len(df)
Out[7]:
294478
In [8]:
df['user_id'].nunique()
Out[8]:
290584
In [9]:
sum(df['converted'] == 1) / len(df['converted'])
Out[9]:
0.11965919355605512
In [10]:
treatment_wrong = sum((df['group'] == 'treatment') & (df['landing_page'] != 'new_page'))
control_wrong = sum((df['group'] == 'control') & (df['landing_page'] == 'new_page'))
total_wrong = treatment_wrong + control_wrong
total_wrong
Out[10]:
3893
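A cross-tabulation of group against landing_page shows the same mismatches at a glance; a quick sketch:

# The treatment/old_page and control/new_page cells together account for
# the 3,893 mismatched rows counted above.
pd.crosstab(df['group'], df['landing_page'])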
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
user_id         294478 non-null int64
timestamp       294478 non-null object
group           294478 non-null object
landing_page    294478 non-null object
converted       294478 non-null int64
dtypes: int64(2), object(3)
memory usage: 11.2+ MB

No rows appear to be missing any values.

In [12]:
df2 = df.copy()  # work on a copy so the cleaning steps below leave df untouched
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
user_id         294478 non-null int64
timestamp       294478 non-null object
group           294478 non-null object
landing_page    294478 non-null object
converted       294478 non-null int64
dtypes: int64(2), object(3)
memory usage: 11.2+ MB
In [13]:
df2 = df2.loc[~((df2['group'] == 'treatment') & (df2['landing_page'] != 'new_page'))]
df2 = df2.loc[~((df2['group'] == 'control') & (df2['landing_page'] == 'new_page'))]
df2.shape
Out[13]:
(290585, 5)
In [14]:
df2[((df2['group'] == 'treatment') == (df2['landing_page'] == 'new_page')) == False].shape[0]
Out[14]:
0
In [15]:
df2['user_id'].nunique()
Out[15]:
290584
In [16]:
df2.loc[df2['user_id'].duplicated()]
Out[16]:
user_id timestamp group landing_page converted
2893 773192 2017-01-14 02:55:59.590927 treatment new_page 0
In [18]:
df2.drop(2893, inplace=True)  # drop the duplicated row by its index label
In [19]:
df2['user_id'].duplicated().value_counts()
Out[19]:
False    290584
Name: user_id, dtype: int64
In [20]:
conv_prob = sum(df2['converted'] == 1) / len(df2['converted'])
conv_prob
Out[20]:
0.11959708724499628
In [21]:
control_group = df2.loc[df2['group'] == 'control']
control_conv_prob = sum(control_group['converted'] == 1) / len(control_group['converted'])
control_conv_prob
Out[21]:
0.1203863045004612
In [22]:
treatment_group = df2.loc[df2['group'] == 'treatment']
treatment_conv_prob = sum(treatment_group['converted'] == 1) / len(treatment_group['converted'])
treatment_conv_prob
Out[22]:
0.11880806551510564
In [23]:
len(treatment_group) / len(df2)
Out[23]:
0.5000619442226688
In [24]:
actual_prob_diff = control_conv_prob - treatment_conv_prob
actual_prob_diff
Out[24]:
0.0015782389853555567

In this dataset, the control group's conversion rate (0.12039) is about 0.00158 higher than the treatment group's (0.11881).
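The same proportions can also be computed more compactly with mean() and groupby(); a minimal sketch on the cleaned df2:

# Overall conversion rate and conversion rate by group, equivalent to the
# cell-by-cell calculations above.
print(df2['converted'].mean())
print(df2.groupby('group')['converted'].mean())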

Part II - A/B Test

$H_{0}: P_{new} - P_{old} \leq 0$

$H_{1}: P_{new} - P_{old} > 0$

$\alpha: 0.05$

In [25]:
conv_prob
Out[25]:
0.11959708724499628
In [26]:
n_new = len(treatment_group)
n_new
Out[26]:
145310
In [27]:
n_old = len(control_group)
n_old
Out[27]:
145274
In [28]:
new_data_sample = treatment_group.sample(n_new, replace = True)

new_page_converted = new_data_sample['converted'].tolist()

p_new_conv_rate = sum(new_page_converted) / len(new_page_converted)
p_new_conv_rate
Out[28]:
0.11848461909022091
In [29]:
old_data_sample = control_group.sample(n_old)  # note: without replace=True, sampling all n_old rows just returns the full control group reshuffled

old_page_converted = old_data_sample['converted'].tolist()

p_old_conv_rate = sum(old_page_converted) / len(old_page_converted)
p_old_conv_rate
Out[29]:
0.1203863045004612
In [30]:
ef_diff = p_new_conv_rate - p_old_conv_rate
ef_diff
Out[30]:
-0.0019016854102402864
In [31]:
p_diffs = np.empty(0)

for x in range(10000):
    new_data_sample = treatment_group.sample(1000, replace = True)
    new_page_converted = new_data_sample['converted'].tolist()
    new_conv_prob = sum(new_page_converted) / len(new_page_converted)
    
    old_data_sample = control_group.sample(1000, replace = True)
    old_page_converted = old_data_sample['converted'].tolist()
    old_conv_prob = sum(old_page_converted) / len(old_page_converted)
    
    difference = new_conv_prob - old_conv_prob
    
    p_diffs = np.append(p_diffs, difference)
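A vectorized alternative is to simulate directly under the null, where both pages share the pooled conversion rate; a sketch that assumes draws at the full group sizes n_new and n_old rather than samples of 1,000:

# Sketch: draw binomial conversion counts at the pooled rate conv_prob for
# each group size, then take the difference in simulated conversion rates.
new_sim = np.random.binomial(n_new, conv_prob, 10000) / n_new
old_sim = np.random.binomial(n_old, conv_prob, 10000) / n_old
p_diffs_null = new_sim - old_sim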
In [32]:
plt.hist(p_diffs)
Out[32]:
(array([  13.,  163.,  687., 1796., 2562., 2705., 1497.,  483.,   88.,
           6.]),
 array([-0.054 , -0.0432, -0.0324, -0.0216, -0.0108,  0.    ,  0.0108,
         0.0216,  0.0324,  0.0432,  0.054 ]),
 <a list of 10 Patch objects>)
In [33]:
greater = [i for i in p_diffs if i > actual_prob_diff]
        
print('len(greater): ' + str(len(greater)))
print('len(p_diffs): ' + str(len(p_diffs)))
greater_prop = len(greater) / len(p_diffs)
greater_prop
len(greater): 4234
len(p_diffs): 10000
Out[33]:
0.4234

The p-value is the probability of observing the statistic, or one more extreme in favor of the alternative, if the null hypothesis is true. Since this p-value (0.4234) is larger than $\alpha = 0.05$, we fail to reject the null hypothesis; only a p-value below $\alpha$ would justify rejecting the null in favor of the alternative.
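The same proportion can be computed without a list comprehension; a one-line sketch:

# Vectorized equivalent of the comparison above.
(p_diffs > actual_prob_diff).mean()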

In [34]:
import statsmodels.api as sm

convert_old = sum(control_group['converted'] == 1)
convert_new = sum(treatment_group['converted'] == 1)
# n_old and n_new were already computed above

conv_diff = (convert_old / n_old) - (convert_new / n_new)
num_diff = n_new - n_old

conv_diff, num_diff
Out[34]:
(0.0015782389853555567, 36)
In [35]:
z_score, p_value = proportions_ztest([convert_new, convert_old], [n_new, n_old])
z_score, p_value
prop: [0.11880807 0.1203863 ]
k_sample: 2
k_sample == 2 and diff: -0.0015782389853555567
p_pooled: 0.11959708724499628
nobs_fact: 1.3765383026571968e-05
prop_var: False
var_: 1.4494070641686e-06
std_diff: 0.0012039132295014454
zstat: -1.3109241984234394
alternative is two-sided.
pvalue: 0.18988337448195103
Out[35]:
(-1.3109241984234394, 0.18988337448195103)

The p-value is greater than the alpha of 0.05, so we fail to reject the null hypothesis according to this test.

Although this p-value (0.1899) differs from the simulated proportion found earlier (0.4234), in part because this test defaults to a two-sided alternative, both approaches lead to the same decision of failing to reject the null hypothesis.
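Since $H_{1}$ is one-sided ($P_{new} - P_{old} > 0$), a matching one-sided test can be obtained from the same function by passing alternative='larger'; a sketch:

# Sketch: one-sided version of the z-test, matching the direction of H1.
z_larger, p_larger = proportions_ztest([convert_new, convert_old],
                                       [n_new, n_old],
                                       alternative='larger')
z_larger, p_larger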

Part III - A Regression Approach

In [36]:
df2['intercept'] = 1
df2['ab_page'] = 0

df2.loc[df2['group'] == 'treatment', 'ab_page'] = 1
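Equivalently, the dummy column can be created from a boolean comparison in a single line; a sketch:

# One-line equivalent of the two assignments above.
df2['ab_page'] = (df2['group'] == 'treatment').astype(int)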
In [37]:
df2.head(20)
Out[37]:
user_id timestamp group landing_page converted intercept ab_page
0 851104 2017-01-21 22:11:48.556739 control old_page 0 1 0
1 804228 2017-01-12 08:01:45.159739 control old_page 0 1 0
2 661590 2017-01-11 16:55:06.154213 treatment new_page 0 1 1
3 853541 2017-01-08 18:28:03.143765 treatment new_page 0 1 1
4 864975 2017-01-21 01:52:26.210827 control old_page 1 1 0
5 936923 2017-01-10 15:20:49.083499 control old_page 0 1 0
6 679687 2017-01-19 03:26:46.940749 treatment new_page 1 1 1
7 719014 2017-01-17 01:48:29.539573 control old_page 0 1 0
8 817355 2017-01-04 17:58:08.979471 treatment new_page 1 1 1
9 839785 2017-01-15 18:11:06.610965 treatment new_page 1 1 1
10 929503 2017-01-18 05:37:11.527370 treatment new_page 0 1 1
11 834487 2017-01-21 22:37:47.774891 treatment new_page 0 1 1
12 803683 2017-01-09 06:05:16.222706 treatment new_page 0 1 1
13 944475 2017-01-22 01:31:09.573836 treatment new_page 0 1 1
14 718956 2017-01-22 11:45:11.327945 treatment new_page 0 1 1
15 644214 2017-01-22 02:05:21.719434 control old_page 1 1 0
16 847721 2017-01-17 14:01:00.090575 control old_page 0 1 0
17 888545 2017-01-08 06:37:26.332945 treatment new_page 1 1 1
18 650559 2017-01-24 11:55:51.084801 control old_page 0 1 0
19 935734 2017-01-17 20:33:37.428378 control old_page 0 1 0
In [38]:
log_mod = sm.Logit(df2['converted'], df2[['intercept', 'ab_page']])
results = log_mod.fit()
Optimization terminated successfully.
         Current function value: 0.366118
         Iterations 6
In [39]:
results.summary()
Out[39]:
Logit Regression Results
Dep. Variable:    converted          No. Observations:  290584
Model:            Logit              Df Residuals:      290582
Method:           MLE                Df Model:          1
Date:             Thu, 13 Feb 2020   Pseudo R-squ.:     8.077e-06
Time:             00:10:50           Log-Likelihood:    -1.0639e+05
converged:        True               LL-Null:           -1.0639e+05
Covariance Type:  nonrobust          LLR p-value:       0.1899

                coef    std err        z      P>|z|    [0.025    0.975]
intercept    -1.9888      0.008  -246.669     0.000    -2.005    -1.973
ab_page      -0.0150      0.011    -1.311     0.190    -0.037     0.007

The p-value associated with ab_page is 0.190, rounded to the thousandth place. It matches the two-sided z-test p-value from Part II (0.18988), since the regression reports a two-sided test for each coefficient.
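For interpretation, the fitted coefficients can be exponentiated into odds ratios; a quick sketch using the results object above:

# exp(coef) gives the multiplicative effect on the conversion odds; for
# ab_page, exp(-0.0150) is roughly 0.985, i.e. about 1.5% lower odds on the
# new page, which is not statistically significant here.
np.exp(results.params)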

In [40]:
countries_df = pd.read_csv('countries.csv')
countries_df.head()
Out[40]:
user_id country
0 834778 UK
1 928468 US
2 822059 UK
3 711597 UK
4 710616 UK
In [41]:
df3 = pd.DataFrame.join(df2, countries_df['country'])
df3.head()
Out[41]:
user_id timestamp group landing_page converted intercept ab_page country
0 851104 2017-01-21 22:11:48.556739 control old_page 0 1 0 UK
1 804228 2017-01-12 08:01:45.159739 control old_page 0 1 0 US
2 661590 2017-01-11 16:55:06.154213 treatment new_page 0 1 1 UK
3 853541 2017-01-08 18:28:03.143765 treatment new_page 0 1 1 UK
4 864975 2017-01-21 01:52:26.210827 control old_page 1 1 0 UK
In [42]:
df3['country'].value_counts()
Out[42]:
US    200926
UK     71501
CA     14315
Name: country, dtype: int64
In [43]:
df3[['CA','UK', 'US']] = pd.get_dummies(df3['country'])
df3 = df3.drop('US', axis=1)
df3.head()
Out[43]:
user_id timestamp group landing_page converted intercept ab_page country CA UK
0 851104 2017-01-21 22:11:48.556739 control old_page 0 1 0 UK 0 1
1 804228 2017-01-12 08:01:45.159739 control old_page 0 1 0 US 0 0
2 661590 2017-01-11 16:55:06.154213 treatment new_page 0 1 1 UK 0 1
3 853541 2017-01-08 18:28:03.143765 treatment new_page 0 1 1 UK 0 1
4 864975 2017-01-21 01:52:26.210827 control old_page 1 1 0 UK 0 1
In [44]:
log_mod2 = sm.Logit(df3['converted'], df3[['intercept', 'ab_page', 'CA', 'UK']])
results2 = log_mod2.fit()
Optimization terminated successfully.
         Current function value: 0.366116
         Iterations 6
In [45]:
results2.summary()
Out[45]:
Logit Regression Results
Dep. Variable:    converted          No. Observations:  290584
Model:            Logit              Df Residuals:      290580
Method:           MLE                Df Model:          3
Date:             Thu, 13 Feb 2020   Pseudo R-squ.:     1.420e-05
Time:             00:10:52           Log-Likelihood:    -1.0639e+05
converged:        True               LL-Null:           -1.0639e+05
Covariance Type:  nonrobust          LLR p-value:       0.3884

                coef    std err        z      P>|z|    [0.025    0.975]
intercept    -1.9891      0.009  -224.021     0.000    -2.007    -1.972
ab_page      -0.0150      0.011    -1.308     0.191    -0.037     0.007
CA           -0.0258      0.027    -0.959     0.338    -0.078     0.027
UK            0.0065      0.013     0.485     0.628    -0.020     0.033

Given that the p-values for the country terms (CA: 0.338, UK: 0.628) are both greater than 0.05, country does not appear to have a significant impact on conversion.
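As with the first model, the country coefficients can be exponentiated for interpretation; a short sketch:

# Odds ratios relative to the US baseline: exp(-0.0258) is about 0.975 for CA
# and exp(0.0065) is about 1.007 for UK, both negligible differences.
np.exp(results2.params)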