/********************************************************************************/ /* */ /* File name: Logistic Regression using SAS program - examples.sas */ /* */ /* Modelled by: Voytek Grus, Nova Scotia Power Inc, Regulatory Affairs */ /* (circa 02/2006) */ /* */ /* Illustrations of Logistic Regression features available in SAS software */ /* presented by Voytek Grus at Shrugs workshop held on February 24, 2006 */ /* */ /********************************************************************************/ options ps=64 ls=100 pageno=1; proc datasets; contents data=resultslr; run; /*****************************************************************************************/ data; odds1=exp(-3.0547); odds2=exp(-1.0923); put odds1 odds2; run; DATA; rsq1=1-exp(-(77.8364**2)/147); rsq2=1-(188.491/110.655)**(2/147); rsqmax=1-(77.8364**(2/147)); PUT rsq1 rsq2 rsqmax; RUN; /*******************************************************************************************************************************/ /*******************************************************************************************************************************/ /* EXAMPLE I: Penalty Data */ /* Data: 147 death penalty cases from New Jersey. All defendents convicted of first-degree murder. Prosecuter recommends death sentance. Penalty trials determine whether death sentence or life imprisonment. Variables: Serious - a rating of the seriousness of the crime by a panel of judges. Culp - culpability of a defendant as based on aggrvating/mitigating circumstances (culp=1 denotes the lowest culpability) blackd - race of a defendant whitvic - race of a victim Purpose of analysis: determine wether and how the outcomes of the trial were influenced by various characteristics of the defendant and the crimes. */ options ps=64 ls=100 pageno=1; proc print data=lrdata.penalty(obs=20); run; /* I 1.0 Linear OLS versus Logistic regression */ /*********************************************************************************************/ %let predictors=blackd whitvic culp serious; /* blackd whitvic culp serious serious */ options ps=64 ls=100 pageno=1; proc reg data=lrdata.penalty outest=stats_ols; model death=&predictors / STB CORRB TOL VIF /* selection=stepwise details=summary slentry=0.2 slstay=0.2*/; output out=resultsols p=pred_ols; run; options ps=64 ls=100 pageno=1; PROC LOGISTIC DATA=lrdata.penalty outest=stats_lr desc; MODEL death=&predictors / STB RSQ /* selection=stepwise details=summary slentry=0.2 slstay=0.2 */; output out=resultslr p=pred_lr; RUN; PROC GENMOD DATA=lrdata.penalty desc; MODEL death=&predictors /Dist=Binomial Link=logit type3; RUN; /*********************************************************************************************/ /* I 1.1 Compare results from OLS and LR regressions */ proc sort data=resultsols out=resultsols; by death &predictors; run; proc sort data=resultslr out=resultslr; by death &predictors; run; %let pcutoff=0.3; data comparison; merge resultsols(keep=death &predictors pred_ols) resultslr(keep=death &predictors pred_lr); by death &predictors; death_ols=pred_ols gt 0.5; if pred_lr gt &pcutoff then death_lr=death; else do; if death=1 then death_lr=0; if death=0 then death_lr=1; end; error_ols=(death-pred_ols)**2;error_lr=(death-pred_lr)**2; correct_ols=death eq death_ols; correct_lr=death eq death_lr; run; proc means data=comparison n noprint; by death; var correct_ols correct_lr error_ols error_lr; output out=summary(drop=_TYPE_ _freq_) sum=correct_ols correct_lr SSerror_ols SSerror_lr n=actual; run; data summary; set summary; correct_ols_p=correct_ols/actual; correct_lr_p=correct_lr/actual; format correct_ols_p correct_lr_p percent7.1; run; proc print data=summary; var death actual correct_ols correct_ols_p correct_lr correct_lr_p SSerror_ols SSerror_lr; sum actual correct_ols correct_lr SSerror_ols SSerror_lr; run; data coefficients; set stats_ols stats_lr; run; options ps=64 ls=130 pageno=1; proc print data=coefficients; run; options ps=64 ls=110 pageno=1; proc print data=comparison(obs=20); run; /*********************************************************************************************/ /* I 1.2 Further Calibration */ options ps=64 ls=100 pageno=1; PROC LOGISTIC DATA=lrdata.penalty outest=stats_lr desc; class culp; MODEL death=blackd whitvic culp serious serious2 / STB RSQ selection=stepwise slentry=0.2 slstay=0.2; output out=resultslr p=pred_lr; RUN; PROC GENMOD DATA=lrdata.penalty desc; class culp; MODEL death=blackd culp /Dist=Binomial Link=logit type3; RUN; /****************************************************************************/ /***************************************************************************************************/ /* A detour - demonstration of a ODS facility */ /* ODS enhancements */ filename ODSOUT 'C:\Results'; ods listing close; ods html PATH=ODSOUT FILE='graphics.html'; ods graphics on; proc logistic data=lrdata.penalty desc; model death=blackd whitvic culp / clparm=wald clodds=pl outroc=roc1;/* Receiver Operating Characteristic Curves */ graphics estprob; run; ods graphics off; ods html close; ods listing; options ps=64 ls=100 pageno=1; title 'Receiver Operating Characteristic Table'; proc print data=roc1; run; /****************************************************************************/ /******************************************************************************/ /* Contingency table versus Logistic regression */ /* Diagnostics */ /* 1.0 Adequacy of Expected Frequencies to see if there are any restrictions on the goodness-of-fit criteria */ options ps=64 ls=100 pageno=1; PROC FREQ DATA=lrdata.penalty order=data; /* WEIGHT frequency; */ TABLES death * (blackd whitvic culp) / chisq out=FreqCnt outexpect sparse; title1 'Frequency Tables'; run; /******************************************************************************/ /* Variations of BINARY LOGISTIC REGRESSION USIN PROC LOGISTIC */ options ps=64 ls=100 pageno=1; PROC LOGISTIC DATA=lrdata.penalty desc; class culp; MODEL death=blackd whitvic culp / link=logit /* stb CLPARM=PL */; output out=resultslr p=pred_lr; RUN; /* PROC LOGISTIC WITH SOME OPTIONS */ PROC LOGISTIC DATA=lrdata.penalty desc; CLASS culp; MODEL death=blackd|whitvic|culp / LACKFIT AGGREGATE RSQ STB link=logit scale=williams technique=newton CLODDS=PL CLODDS=WALD SELECTION=stepwise CORRB influence iplots; UNITS culp=2 / DEFAULT=1; Output out=results pred=phat lower=lb upper=up reschi=stres dfbetas=dfs; RUN; /******************************************************************************/ /* BINARY LOGISTIC REGRESSION USIN PROC GENMOD */ /* can produce likelihood ratio tests for ind. coefficients which are superior to Wald tests */ PROC GENMOD DATA=lrdata.penalty desc; CLASS culp; MODEL death=blackd whitvic culp /Dist=Binomial Link=logit /* type3 */; RUN; /******************************************************************************/ /******************************************************************************/ /* MULTINOMIAL LOGIT MODELS: NOMINAL RESPONSE */ /* EXAMPLE II: Survey of 195 university students to study the effects of parenting styles on altruistic behaviour. Survey questions what would you do if you found a wallet with money in it? 1. keep both 2. keep money 3. return both IVs included sex, univ. dept., punish, explain. */ options ps=64 ls=100 pageno=1; proc print data=lrdata.wallet(obs=20); run; options ps=64 ls=100 pageno=1; PROC FREQ DATA=lrdata.wallet order=data; /* WEIGHT frequency; */ TABLES (male business punish explain) * (male business punish explain) / chisq out=FreqCnt outexpect sparse; title1 'Frequency Tables'; run; /* Unordered responses */ PROC LOGISTIC DATA=lrdata.wallet /* desc */; /* class punish; */ MODEL wallet = male business punish explain /STB RSQ link=glogit; RUN; /* Ordered responses */ PROC LOGISTIC DATA=lrdata.wallet /* desc */; /* class punish; */ MODEL wallet = male business punish explain /STB RSQ link=clogit; RUN; /* Can use contrast statement to test whether difference between eq1 vs eq2 is significant */ PROC CATMOD DATA=lrdata.wallet; DIRECT male business punish explain; MODEL wallet = /* _response_ */ male business punish explain / NOITER PRED; RUN; /******************************************************************************/ /******************************************************************************/ /* Example of Conditional Logistic Regression */ /* DISCRETE CHOICE ANALYSIS USING PROC MDC */ /* Customer Choice Model Travel Example Survey of 210 people about their preference for one of the four travel options: air, train, bus or car. Travel characteristics included: terminal waiting time, total travel time, cost, household income, traveling party size. */ options ps=64 ls=100 pageno=1; data travel1(drop=ttme); set lrdata.travel; ttime=ttme; choice = 2- choice; air= mode EQ 1; train= mode EQ 2; bus= mode EQ 3; airhinc=air*hinc; airpsize=air*psize; trahinc=train*hinc; trapsize=train*psize; bushinc=bus*hinc; buspsize=bus*psize; run; data travel2; set travel1; choice=2-choice; run; options ps=64 ls=110 pageno=1; proc print data=travel2(obs=20); run; PROC PHREG DATA=travel1 NOSUMMARY; MODEL choice = ttime time cost / TIES=DISCRETE RL; STRATA id; RUN; proc mdc data=travel2; model choice = ttime time cost / type=clogit nchoice=4; id id; run; /* Full-blown model */ PROC PHREG DATA=travel1 NOSUMMARY; MODEL choice = air bus train airhinc bushinc trahinc airpsize buspsize trapsize ttime time cost / TIES=DISCRETE; STRATA id; RUN; proc mdc data=travel2; model choice = air bus train airhinc bushinc trahinc airpsize buspsize trapsize ttime time cost / type=clogit nchoice=4; id id; run; /* Example of nested logit modeling */ proc mdc data=travel2 maxit=200 outest=a; model choice = ttime time cost / type=nlogit choice=(mode); id id; utility u(1,1 2 3 @ 1) = ttime time cost, u(1,4 @ 2) = time cost; nest level(1) = (1 2 3 @ 1, 4 @ 2), level(2) = (1 2 @ 1); run; /******************************************************************************/ /******************************************************************************/ /* LOGIT ANALYSIS OF CLUSTERED DATA (A CASE OF LONGITUDINAL CLUSTER) PROC GENMOD (GEE) and PROC PHREG database: A sample of 316 people who survived wood fires near Philadelphia. They were interviewed at 3 months, 6 months, 12 months after the fire. purpose: to eplore the factors behind post-traumatic stress disorder (PTSD). The explanatory variables include Control: a scale of a person's perceived control over several areas of life Problems: the total # of problems reported in several areas of life Sevent: the # of stressful events reported since last interview Cohes: a scale of family cohesion measured only at the initial interview and duplicated across the 3 records of each person Subjid: a unique identification for each person */ options ps=64 ls=110 pageno=1; proc print data=lrdata.ptsd(obs=20); run; /* Step 1.0 Regular Logistic Regression: assume any effects are contemporaneous (no lags no leads) all observations are independent */ options ps=64 ls=100 pageno=1; PROC LOGISTIC DATA=lrdata.ptsd desc; class time; MODEL ptsd = control problems sevent cohes time / link=logit; RUN; /* Step 1.1 testing for lag variable effect */ data ptsd; set lrdata.ptsd; by subjid; ptsd1=lag1(ptsd); if first.subjid then ptsd1=0; /* if last.subjid then ptsd1=0;*/ run; options ps=64 ls=110 pageno=1; proc print data=ptsd(obs=20); run; PROC LOGISTIC DATA=ptsd desc; class time; MODEL ptsd = control problems sevent cohes time ptsd1 / link=logit; RUN; /* dependence among observations may result in underestimated standard errors and thus overestimated Wald chi-square stats. Coefficients remain unbiased */ /* Possible Remedies: /* Step 20. Use GEE (Generalized estimating equation: iterative generalized least squares) Estimation in PROC GENMOD */ /* GEE estimation is invoked with REPEATED statement SUBJECT option - identification code for each cluster WITHIN option - identification of different times within cluster TYPE optiont - specifies strucure of the correlation matrix */ PROC GENMOD DATA=lrdata.ptsd DESC; CLASS subjid time; MODEL ptsd= control problems sevent cohes time / D=B; REPEATED SUBJECT=subjid / WITHIN=time TYPE=UN CORRW; RUN; proc logistic data=lrdata.ptsd; class time; strata subjid; model ptsd(event='1')= control problems sevent cohes time / clodds=Wald; run; /* Alternative Way */ data ptsd; set lrdata.ptsd; ptsd=2-ptsd; t1=time EQ 1; t2=time EQ 2; run; /* Use strata statement to group observations into clusters. TIES option is necessary to force PHREG to estimate a logit model rather than some inappropriate survival model */ PROC PHREG DATA=ptsd NOSUMMARY; MODEL ptsd= control problems sevent cohes t1 t2 / TIES=DISCRETE; STRATA subjid; run; /* b) Fixed-Effects with Conditional Analysis GEE does not correct for bias resuling from omitted explanatory variables at the cluster level. When there are multiple observations per person it is possible to statistically control for all stable characteristics of persons, regardless whether these characteristics can be measured. log (pi/(1-pi)=ai + BXi. ai term represents all differences among individuals that are stable over time. a can be treated as a random variable - random effect model or ai can be treated as a set of fixed constants - fixed effect model. Should the response be continuous could introduce many multiple level respone variables (dummies). That does not work with logistic regression. Can only do this if # of clusters is small and there are many observations per cluster. Have insufficient sample size to estimate the pair effect. Logis. Reg. requires adequate frequencies for all pairs of discrete predictors in order for the goodness-of-fit statistics to be reliable. Rule of thumb is that no more than 20% of the cells have frequencies less than 5. Incidental Parameter Problem: according to the asymptotic theory of maximum likelihood estimation, it is usually assumed that the number of observations gets large while the # of parameters to be estimated remains constant. A solution to the incidential parameter problem is a conditional likelihood estmiation. By using conditional arguments, one can eliminate the pair effect and estmiate other effect. Conditional logistic regr. takes into account stratification by basing the maximum likelihood estimation of parameters on a conditional likelihood. The estimation is going through the following process: given that k events happened to this individual, what factors explain why they happened at these particular times and not at other times? Conditional Logit Model has a likelihood that's identical to certain discrete-time survival models that can be estimated with PROC PHREG. */ /* create dummy variables for time variable because PHREG does not have a class statement */ proc logistic data=ptsd; strata subjid; model ptsd(event='1')= control problems sevent cohes t1 t2 / clodds=Wald; run; /* Conditional Logit only take use of those clusters that show variantion within. No variation among clusters is used in the process. Therefore there is loss of information. In this case the conditional logit tries to explain why a person with a certain # of PTSD symptoms had them at some times and not at other times. If there is no variantion across all 3 intervies there is nothing to explain. These observations are discarded (37 + 144 that's 57% of observations. As a result the std errors are much bigger. /* The difference between fixed-effect and GEE models is a trade-off between bias and inefficiency. Fixed effect model is less prone to bias because the cond. ll discards information that is potentially contaminated by confounding variables. But if confounding variables are not a problem then the fixed effect model can ignore potentially useful information.