***********************************************************************; * This program computes an indepdent samples t-test using a * simple empirical bootstrap through repeated sampling with * replacement. The program was written by Michael Brannick. ***********************************************************************; data d1; input DV group; *********************************** * the program wants the two * groups to be stacked so that * all the dependent variable data * appear in one column, and the * group identification variable * appears in the second column ***********************************; array items DV group; do over items; if items eq . then delete; end; *********************************** * the above statements delete any * missing observations ***********************************; cards; 8 1 9 1 10 1 9 1 8 1 7 1 8 1 9 1 10 1 7 1 10 2 11 2 12 2 10 2 11 2 12 2 11 2 11 2 9 2 12 2 11 2 proc sort; by group; ***************************************; * this stacks the data by group. ***************************************; proc iml; *********************************************************************; use d1 var{DV Group}; **********************************************************************; * the dataset name that you want to analyze follows the word 'use'. * The name of the variables in the dataset are named after 'var'. **********************************************************************; read all var{DV} into DV; read all var{Group} into Group; *********************************************************************** *** Set up bootstrap estimates ***********************************************************************; maxlabel = max(Group); minlabel = min(Group); totalN = nrow(Group); print maxlabel minlabel totalN; ******************************************* * Find the number of people in each group *******************************************; N1 = Group #(Group = maxlabel); * find the number of people (observations) in group 1; n1 = (N1=0); n1 = n1[+,]; N2 = Group # (Group = minlabel); *find the number of people in group 2; N2 = (N2 = 0); N2 = N2[+,]; print n1 n2; *******************************************; *placeholders for each group data; *******************************************; g1 = j(N1,1,-9); a1 = 1; g2 = j(N2,1,-9); b1 = 1; do i = 1 to nrow(DV); if group[i,1] = minlabel then do; g1[a1,1] = dv[i,1]; a1 = a1+1; end; if group[i,1] = maxlabel then do; g2[b1,1] = DV[i,1]; b1= b1+1; end; end; *print g1 g2; iter = 10000; *the number of samples for the empirical sampling distribution; out1 = j(iter,1,-9); * placeholder for estimates from each sample; do boot = 1 to iter; * main loop to compute number of estimates; g1dat = j(N1,1,-9); * placeholder for raw data for group 1; g2dat = j(N2,1,-9); *placeholder for raw data for group 2; ******************************************************************; do sam = 1 to N1; * pick a sample with replacement for group 1; row = uniform(0)*n1; row = round(row); if row < 1 then row = 1; if row > n1 then row =n1; g1dat[sam,] = g1[row,]; end; * g1dat should fill with data; do sam = 1 to N2; * pick a sample with replacement for group 2; row = uniform(0)*n2; row = round(row); if row < 1 then row = 1; if row > n2 then row =n2; g2dat[sam,] = g2[row,]; end; * g2dat should fill with data; *print g1dat g2dat; *******************************************************************; * Compute mean difference *******************************************************************; M1 = g1dat[+,]/N1; M2 = g2dat[+,]/N2; Diff = M1-M2; ******************************************************************; out1[boot,] = Diff; * Save estimate for the Nth sample; *print diff; end; *******************************************************************; * For a two-tailed test based on 10000 iterations, the extremes are * 2500 and 9750. *******************************************************************; *Find Observed Means; Mean1 =0; do c1 = 1 to totalN; If Group[c1,1] = minlabel then mean1 = mean1+DV[c1,1]; end; Mean1 = mean1/n1; ***; Mean2 = 0; do c1 = 1 to totalN; if group[c1,1] = maxlabel then mean2 = mean2+DV[c1,1]; end; Mean2 = mean2/n2; Obs_Diff = Mean1 - Mean2; * Find bootstrap confindence interval; rankvar = rank(out1[,1]); boot_low = out1[loc(rankvar=250),1]; boot_up = out1[loc(rankvar=9750),1]; print Mean1 Mean2 Obs_Diff; print 'empirical (bootstrap) confidence interval' boot_low boot_up; print 'result is significant if empirical confidence interval DOES NOT contain zero'; *Output the results; Create out1 from out1; append from out1; data d2; set out1; proc univariate plot normal; quit; run;