program blogit_2P, eclass //version 0: first programmed 18 decembre 2007 version 8.0 syntax varlist(min=5) [, method(string) design] gettoken d varlist : varlist gettoken z varlist : varlist gettoken nS1 varlist : varlist gettoken nS2 indepvars : varlist tokenize `indepvars' //**************************************************************************** di di "logistic regression for grouped two-phase data" di // check whether all variables are numeric confirm numeric variable `nS1' `nS2' `indepvars' // test whether d is a binary 0/1 variable preserve qui drop if `d'==0 | `d'==1 qui summ `d' if r(N)!=0 { di in red "Variable `d' can take only values 0 and 1" error 459 exit } restore // test whether z is an integer variable from 1 to n preserve qui summ `z' local maxz= r(max) forvalues i=1/`maxz' { qui drop if `z'==`i' } qui summ `d' if r(N)!=0 { di in red "Variable `z' can take only integer values from 1 to `maxz'" error 459 exit } restore capture assert `nS1' == int(`nS1') if _rc { di in blu "note: you are responsible for " /* */ "interpretation of non-count Phase One variable" } capture assert `nS2' == int(`nS2') if _rc { di in blu "note: you are responsible for " /* */ "interpretation of non-count Phase Two variable" } //********************** preliminary computations and output *********************************** tempname Estimate CovMat qui summ `z' local nstrata=r(max) di di "number of strata : " %6.0f `nstrata' di qui { preserve sort `z' `d' by `z' `d' : keep if _n==1 qui summ `nS1' if `d'==1 local ncases1=r(sum) qui summ `nS1' if `d'==0 local ncontrols1=r(sum) restore } di "total number of cases (`d'=1) in Phase One : " %6.0f `ncases1' di "total number of controls (`d'=0) in Phase one : " %6.0f `ncontrols1' di qui summ `nS2' if `d'==1 local ncases2=r(sum) qui summ `nS2' if `d'==0 local ncontrols2=r(sum) di "total number of cases (`d'=1) in Phase Two : " %6.0f `ncases2' di "total number of controls (`d'=0) in Phase Two : " %6.0f `ncontrols2' di if "`design'"=="design" { di "study design" di "Stratum number Phase One Phase Two" di " Cases Controls Cases Controls" forvalues i=1/`nstrata'{ qui { preserve keep if `z'==`i' gsort `d' local ntem1 = `nS1'[1] gsort -`d' local ncas1 = `nS1'[1] qui summ `nS2' if `d'==0 local ntem2 = r(sum) qui summ `nS2' if `d'==1 local ncas2 = r(sum) restore } di %2.0f `i' /// " " %9.2f `ncas1' %9.2f `ntem1' /// " " %9.2f `ncas2' %9.2f `ntem2' } di } //*********************************** ML algorithm ************************************** if "`method'"!="WL" { di " Maximum Likelihood estimation using the EM algorithm // // begin EM algorithm // // nS1 contains the total dxZ S1 counts tempvar sumS2z NS1 num2 lf sort `z' `d' by `z' `d': egen `sumS2z'=sum(`nS2') gen `lf'=log(`sumS2z'/`nS1') // lf contains log(nij/Nij) gen `NS1'= `nS1'-`sumS2z' // NS1 contains the dxZ S1\S2 counts sort `z' `indepvars' `d' //______________________________________________________________ // prepare design variables zx* //______________________________________________________________ qui summ `z' tempname nobs local i 1 scalar `nobs'=r(N) egen `num2'=fill(1 1 2 2) local nzx=`nobs'/2 local listzxtmp "" while `i' <= `nobs'/2 { tempvar zx`i' local listzxtmp "`listzxtmp' `zx`i''" qui gen `zx`i''= (`num2'==`i') local i= `i'+1 } * update independent variables local i 1 local indvartmp "" while "``i''" != "" { tempvar ``i''0 local indvartmp "`indvartmp' ```i''0'" qui gen ```i''0' =`d'*``i'' local i=`i'+1 } //______________________________________________________________ // EM algorithm //______________________________________________________________ tempname b coefEMn coefEMp diff modit local it 1 tempvar nn musum muijk pi_hat nS1EM gen `nn'=`nS2' * M step1 qui poisson `nn' `listzxtmp' `indvartmp' `d' , offset(`lf') nocon qui predict `muijk' matrix `coefEMn'=e(b) * E step sort `d' `z' quietly { by `d' `z' : gen `musum'=sum(`muijk') by `d' `z' : replace `musum'=`musum'[_N] gen `pi_hat'=`muijk'/`musum' gen `nS1EM'=`NS1'*`pi_hat' replace `nn'=`nS2'+`nS1EM' drop `muijk' * M step poisson `nn' `listzxtmp' `indvartmp' `d' , nocon predict `muijk' matrix `coefEMp'=`coefEMn' matrix `coefEMn'=e(b) mat `b'= `coefEMn' scalar `diff'=mreldif(`coefEMn', `coefEMp') } while `diff' > 0.0000001 & `it'<=1000 { * E step quietly { by `d' `z' : replace `musum'=sum(`muijk') by `d' `z' : replace `musum'=`musum'[_N] qui replace `pi_hat'=`muijk'/`musum' drop `muijk' replace `nS1EM'=`NS1'*`pi_hat' replace `nn'=`nS2'+`nS1EM' * M step local it =`it' +1 poisson `nn' `listzxtmp' `indvartmp' `d' , nocon predict `muijk' matrix `coefEMp'=`coefEMn' matrix `coefEMn'=e(b) scalar `diff'=mreldif(`coefEMn', `coefEMp') scalar `modit'=mod(`it',100) } if `modit' == 0 { di "number of iterations " `it' " current difference " `diff' } mat `b'= `coefEMn' } di "number of iterations " `it' //______________________________________________________________ * get variance estimators //______________________________________________________________ tempvar zd rw tempname pi nzd nnS1 muijk_matrix V VV gen `zd' = `z'+`nstrata'*`d'+1 gsort `zd' mkmat `pi_hat',mat(`pi') mkmat `zd',mat(`nzd') mkmat `NS1',mat(`nnS1') mkmat `muijk',mat(`muijk_matrix') * compute bloc diag matrix VV of pi local i 1 local k 1 scalar `nobs' =rowsof(`nnS1') mat `VV' =J(`nobs',`nobs',0) while `k' < `nobs'{ local nsc=`nzd'[`k',1] while `nzd'[`i',1] == `nsc' { local j=`k' while `nzd'[`j',1] == `nsc' { if `i' == `j' { mat `VV'[`i',`j'] =`pi'[`i',1]*(1-`pi'[`i',1]) } else { mat `VV'[`i',`j']=-`pi'[`i',1]*`pi'[`j',1] } local j=`j'+1 } local i = `i'+1 } local k= `i' } egen `rw'=fill(1 2) * get bloc diag multinomial covariance tempname V1 V2 V I b1 mat `V1'=diag(`muijk_matrix') mat `V2'=`VV'*diag(`nnS1') mat `V'=`V1'-`V2' mat glsaccum `I'= `listzxtmp' `indvartmp' `d' ,group(`zd') glsmat(`V') row(`rw') nocon mat `V'= syminv(`I') //______________________________________________________________ * extract coefficients //______________________________________________________________ local ib =`nzx' +1 local ib1 1 local df=e(df_m) local size = `df'-`nzx' mat `Estimate'=`b'[1,`ib'...] mat `CovMat' =`V'[`ib'...,`ib'...] local N=e(N) } // // end EM algorithm // //********************************** WL *************************** else { // WL algorithm di " Weighted Likelihood estimation // // begin WL algorithm // local method "WL" tempvar nij num2 fij w Info_ij score const sqrtnS2 sqrtInfo num xb tempname A B Aij Bij Dij CovUij sort `z' `d' by `z' `d': egen `nij'=sum(`nS2') gen `fij'=`nij'/`nS1' // fij contains nij/Nij gen `w'=`nS2'/`fij' qui logit `d' `indepvars' [iw=`w'] _predict `xb',xb mat `Estimate'=e(b) local nvar=wordcount("`indepvars'") /*_____________________________________________________________ get variance estimators according to Schill and Drescher SIM 1997 (see also tech report 2P-24 in which a mistake is corrected) __________________________________________________________ */ local size=e(df_m)+1 mat `A'=J(`size',`size',0) mat `B'=J(`size',`size',0) mat `CovUij'=J(`size',`size',0) /* generate score variables */ gen `const'=1 gen `Info_ij'=`nS2'*exp(`xb')/((1+exp(`xb'))^2) gen `score'=`d'-1/(1+exp(-`xb')) local indepscore="" forvalues l=1/`nvar' { tempvar S`l' *di "S`1' ``1''" gen `S`l''=`score'*``l'' local indepscore="`indepscore' `S`l''" } // add constant local indepscore="`indepscore' `score'" forvalues i=0/1 { forvalues j=1/`nstrata' { preserve qui keep if `d'==`i' & `z'==`j' qui summ `fij' local f_ij=r(mean) qui summ `nij' local n_ij=r(mean) gen `sqrtnS2'=sqrt(`nS2') gen `sqrtInfo'=sqrt(`Info_ij') gen `num'=_n sort `num' mat opaccum `Aij'= `indepscore', group(`num') opvar(`sqrtnS2') noconstant mat opaccum `Bij'= `indepvars' `const' , group(`num') opvar(`sqrtInfo') noconstant mat vecaccum `Dij'= `nS2' `indepscore', noconstant mat `CovUij'= (`Aij'-`Dij''*`Dij'/`n_ij') mat `A'=`A'+`CovUij'*(1-`f_ij')/(`f_ij'*`f_ij') mat `B'=`B'+`Bij'/`f_ij' restore } } tempname Bread mat `Bread'=syminv(`B') mat `CovMat'=`Bread'+`Bread'*`A'*`Bread' // // end WL algorithm // } //______________________________________________________________ // post estimates //______________________________________________________________ local name = subinstr("`indepvars'"," "," :",.) + " :_cons" mat coln `Estimate'= `name' mat coln `CovMat' = `name' mat rown `CovMat' = `name' local title "logistic regression for grouped two-phase data" ereturn post `Estimate' `CovMat' ereturn scalar Nstrata =`nstrata' ereturn scalar Ncases_P1 =`ncases1' ereturn scalar Ncontrols_P1 =`ncontrols1' ereturn scalar Ncases_P2 =`ncases2' ereturn scalar Ncontrols_P2 =`ncontrols2' ereturn local method "`method'" ereturn local depvar "`d'" ereturn local strata "`z'" ereturn local P1counts "`nS1'" ereturn local P2counts "`nS2'" ereturn display end