/*************************************************************************************************************/ /* Dataset */ data fueldata; set dw.FuelData; where obsno lt 49; run; options ps=64 ls=140 pageno=1; title1 'Motor Fuel Consumption Data (gallons per person) by the USA states (1971)'; proc print data=FuelData; run; options ps=64 ls=100 pageno=1; proc reg data=FuelData lineprinter; model FuelUse = Drivers Tax Income RoadMiles / acov; run;quit; /*************************************************************************************************************/ /*************************************************************************************************************/ /* Step #1 */ /* Variable screening process: Two ways to proceed about it. Can model all the steps involved or use ODS graphics with the Automated Variable Selection methods to do everything in one shot. */ /* Variable Screening Process: Step by step */ /* First look at the data */ options ps=64 ls=100 pageno=1; proc reg data=FuelData lineprinter; model FuelUse = Drivers Tax Income RoadMiles / stb ss1 scorr1 ss2 scorr2 partial; plot r.*p.; plot FuelUse * Tax; run;quit; /* Many Diagnostics of violation of assumptions available */ /* tests for normalit,y Heteroscedasticity(heterogenous variance), autocorrelation, collinearity */ options ps=64 ls=160 pageno=1; proc reg data=FuelData; model FuelUse = Drivers Tax Income RoadMiles / spec dw tol vif collin influence; run;quit; /* More detailed discovery process */ proc sort data=FuelData out=Sorted; by region; run; /* Illustration of flexibility in data modeling within the proc reg */ options ps=64 ls=100 pageno=1; proc reg data=Sorted; by region; where state ne 'WY'; MODEL FuelUse = Tax; MODEL FuelUse = Tax Drivers; MODEL FuelUse = Income; MODEL FuelUse = RoadMiles; MulitpleReg: MODEL FuelUse = Tax Drivers Income RoadMiles; run;quit; /*************************************************************************************************************/ /* AUTOMATED WAY for a variable selection */ /* Variable selection can be automated by using various methods such as: stepwise reg, forward, backward, Rsq, MXRL etc */ /* Should be remembered that lots of this can be done automatically using the ODS feature of proc reg. */ ods html;ods graphics on; proc corr data=FuelData plots; var Drivers Tax Income RoadMiles;run; ods graphics off; ods html close; ods html;ods graphics on; proc reg data=FuelData plots(unpackpanels); model FuelUse = Drivers Tax Income RoadMiles / spec dw tol vif collin partial selection=stepwise slentry=0.1 slstay=0.01; run; ods graphics off; ods html close; quit; /*************************************************************************************************************/ /*************************************************************************************************************/ /*** Some tips to do variable screening ***/ /* Digression: Can use rsreg (quadratic response surface regression) to automate variable screening process even further */ proc rsreg data=FuelData; MODEL FuelUse = Drivers Tax Income RoadMiles; run; /*** Also two other interesting features: tests on subtests of coeficients and regression under restricted coefficients */ /* Example of using subgroup test statements */ options ps=64 ls=140 pageno=1; proc reg data=FuelData ; MODEL FuelUse = Drivers Tax Income RoadMiles / stb ss1 scorr1 ss2 scorr2; test_tax_eq_income: test tax=Income; test_drivers_eq_roadmiles: test drivers=roadmiles; run;quit; * Example of using restrict statement */ options ps=64 ls=140 pageno=1; proc reg data=FuelData; MODEL FuelUse = Drivers Tax Income RoadMiles / stb ss1 scorr1 ss2 scorr2; MODEL FuelUse = Drivers Tax Income RoadMiles / stb ss1 scorr1 ss2 scorr2; restrict_tax_eq_income: restrict tax=Income; MODEL FuelUse = Drivers Tax Income RoadMiles / stb ss1 scorr1 ss2 scorr2; restrict_drivers_eq_roadmiles: restrict drivers=RoadMiles; run;quit; /*************************************************************************************************************/ /*************************************************************************************************************/ /* INFLUENCE DIAGNOSTIC ANALYSIS: MORE ARTS THAN SCIENCE (Fine-tooth comb approach) */ /***************************************************************************************************************************/ /*** Enhance graph features of Proc Reg with an annotate dataset ***/ proc reg data=FuelData noprint; MODEL FuelUse = Drivers Tax Income RoadMiles/ influence; output out=RegOutput p=predicted residual=residuals student=student rstudent=rstudent h=leverage stdr=stdr press=press cookd=cookd dffits=dffits covratio=covratio; run; quit; /* title 'Diagnostic Statistics for Regression Analysis of the USA Fuel Use Data'; options ps=64 ls=220 pageno=1; proc print data=RegOutput;run; */ data graphspecs; length color style text $ 10; retain when 'a' xsys ysys '2' hsys '3'; set RegOutput; function='symbol'; position='5'; select; when (Inclevel='Low ') do;style='marker'; text='U';end; when (Inclevel='Med ') do;style='marker'; text='C';end; when (Inclevel='High') do;style='marker'; text='P';end; otherwise;end; select; when (region='SouthEast ') color='green'; when (region='Mountain ') color='red'; when (region='Pacific ') color='blue'; when (region='NorthEast ') color='brown'; when (region='MidWest ') color='orange'; when (region='NorthCentral') color='black'; otherwise;end; size=2; output; function='label'; style='swissb'; text=state; position='2'; size=2; output; run; proc datasets library=work memtype=(data catalog); delete gseg / memtype=catalog; run; proc reg data=RegOutput; MODEL FuelUse = Drivers Tax Income RoadMiles /* / influence */; symbol1 c=green v=dot h=1; title 'Scatter Plot: Fuel Use versus Tax'; plot FuelUse*Tax /annotate=graphspecs(rename=(FuelUse=y Tax=x))mse vaxis=(300 to 1000 by 25) vref=300 to 1000 by 25 cvref=blue href= 5 to 11 by 0.5 chref=red cframe=ligr; run; title 'Scatter Plot: Residuals versus Predicted Fuel Use'; plot residual.*predicted. /annotate=graphspecs(rename=(residuals=y predicted=x))mse vaxis=(-150 to 250 by 15) vref=-150 to 250 by 15 cvref=blue href= 0 to 9000 by 500 chref=red cframe=ligr; run; title 'Scatter Plot: Studentized Residuals versus Leverage'; plot rstudent.*h. / annotate=graphspecs(rename=(rstudent=y leverage=x))mse haxis=(0 to 0.5 by 0.05) vref= -3 to 5 by 0.5 cvref=blue href= 0 to 48 by 1 chref=red cframe=ligr; ; run; title "Cook's Distance (Composite measure of the influence on coefficients)"; plot cookd.*obs. /annotate=graphspecs(rename=(cookd=y obsno=x)) vaxis=(0 to 0.6 by 0.05) vref= 0 to 0.6 by 0.05 cvref=blue haxis=(0 to 48 by 1) href= 0 to 48 by 1 chref=red cframe=ligr; run; title 'Covratios'; plot covratio.*obs. /annotate=graphspecs(rename=(covratio=y obsno=x)) vaxis=(0.5 to 1.8 by 0.1) vref= 0.5 to 1.7 by 0.1 cvref=blue haxis=(0 to 48 by 1) href= 0 to 48 by 1 chref=red cframe=ligr; run;quit; /***************************************************************************************************************************/ /*** Use Proc Gplot to show dfbetas information on the graphs ***/ ods exclude outputstatistics;*suppresses printing of Influence statistics; proc reg data=FuelData; MODEL FuelUse = Drivers Tax Income RoadMiles/ influence; ods output outputstatistics=influence2; run;quit; data graphdata; merge fueldata influence2(drop=model dependent rename=(observation=obsno)); by obsno; label dffits=' '; run;quit; options ps=64 ls=200 pageno=1; proc print data=graphdata; run; data graphspecs2; length color style text $ 10; retain when 'a' xsys ysys '2' hsys '3'; set graphdata; function='symbol'; position='5'; select; when (Inclevel='Low ') do;style='marker'; text='U';end; when (Inclevel='Med ') do;style='marker'; text='C';end; when (Inclevel='High') do;style='marker'; text='P';end; otherwise;end; select; when (region='SouthEast ') color='green'; when (region='Mountain ') color='red'; when (region='Pacific ') color='blue'; when (region='NorthEast ') color='brown'; when (region='MidWest ') color='orange'; when (region='NorthCentral') color='black'; otherwise;end; size=2; output; function='label'; style='swissb'; text=state; position='2'; size=2; output; run; proc datasets library=work memtype=(data catalog); delete gseg / memtype=catalog; run; goptions reset=symbol;run; title1 "DFBETAS on the Graph"; legend1 label=none value=none; proc gplot data=graphdata; plot dfb_tax * dfb_drivers / frame caxis=Green haxis=axis1 vaxis=axis2 legend=legend1 autohref autovref lvref=20 lhref=20 annotate=graphspecs2(rename=(dfb_tax=y dfb_drivers=x)); plot dfb_tax * dfb_income / frame caxis=Green haxis=axis1 vaxis=axis2 legend=legend1 autohref autovref lvref=20 lhref=20 annotate=graphspecs2(rename=(dfb_tax=y dfb_income=x)); plot dfb_tax * dfb_roadmiles / frame caxis=Green haxis=axis1 vaxis=axis2 legend=legend1 autohref autovref lvref=20 lhref=20 annotate=graphspecs2(rename=(dfb_tax=y dfb_roadmiles=x)); plot dffits * obsno / frame caxis=Green haxis=axis1 vaxis=axis2 legend=legend1 autohref autovref lvref=20 lhref=20 annotate=graphspecs2(rename=(dffits=y obsno=x)); run; quit; /***************************************************************************************************************************/ /***************************************************************************************************************************/ /*** Outlier Tests ***/ /* Critical values for the outlier tests (at n=48 p=5 and at cl=0.05 tcrit=3.53 and at cl=0.01 tcrit=4.06) */ options ps=64 ls=100 pageno=1; title 'Results of Outlier and Leverage tests using M method in the Robust Regression'; proc robustreg data=FuelData method=M (asympcov=H1); /* Huber (1971) Maximum Likelihood Estimate */ MODEL fueluse = Drivers Tax Income RoadMiles / diagnostics leverage; output out=results1 predicted=predicted residual=residual sresidual=sresidual leverage=leverage outlier=outlier; run; options ps=64 ls=100 pageno=1; title 'Results of Outlier and Leverage tests using M method in the Robust Regression'; proc print data=results1(drop=obsno); where leverage gt 0 or outlier gt 0; run; options ps=64 ls=100 pageno=1; title 'Results of Outlier and Leverage tests using LTS method in the Robust Regression'; proc robustreg data=FuelData method=LTS;/* The least trimmed squares (LTS) estimate proposed by Rousseeuw (1984)High Breakdown Value Estimation*/ MODEL fueluse = Drivers Tax Income RoadMiles / diagnostics leverage; /* output out=results2 predicted=predicted residual=residual sresidual=sresidual leverage=leverage outlier=outlier;*/ run; options ps=64 ls=170 pageno=1; title 'Results of Outlier and Leverage tests using LTS method in the Robust Regression'; proc print data=results2; where leverage gt 0 or outlier gt 0; run; /************************************************************************************************************************************/ /************************************************************************************************************************************/ /* AN EXAMPLE OF AN INTERACTIVE ANALYSIS USING PROC REG */ proc datasets library=work memtype=(data catalog); delete gseg / memtype=catalog; run; goptions reset=global gunit=pct border cback=white colors=(blue green red) ctext=black htitle=5 ftitle=swissb htext=3 ftext=swiss; options ps=64 ls=80 pageno=1; proc reg data=FuelData /* lineprinter */; var Drivers Tax Income RoadMiles; MODEL FuelUse = Drivers Tax Income RoadMiles; symbol1 c=green v=dot h=1; plot residual.*predicted. /sse cp mse cframe=ligr; run; /* To illustrate re-running regression on different set of predictors */ delete RoadMiles;print;run; plot r.*p. / mse cframe=ligr;run; add RoadMiles;print;run; /* Impact on regression analysis of excluding certain observations */ reweight allobs / reset;print;run; /* restore to original results */ reweight obs.=45 / weight=0;refit;print;run; /* impact of Nevada on RA results */ plot r.*p. / mse cframe=ligr;run; reweight allobs / reset;print;run; /* restore to original results */ reweight obs.=40 / weight=0;refit;print;run; /* impact of Wyoming on RA results */ plot r.*p. / mse cframe=ligr;run; reweight allobs / reset;print;run; /* restore to original results */ reweight obs.=19 / weight=0;refit;print;run; /* impact of South Dakota on RA results */ plot r.*p. / mse cframe=ligr;run; /* Test the information from the Influence Diagnostics Tests: covratios and dfbetas etc. */ reweight allobs / reset;print;run; /* restore to original results */ reweight rstudent. > 2 or rstudent. < -0.2 / weight=0;refit;print;run; /* remove all data points with high sresiduals */ plot r.*p. / mse cframe=ligr;run; reweight allobs / reset;print;run; /* restore to original results */ reweight h. > 0.2 / weight=0;refit;print;run; /* remove all points with high leverage */ plot r.*p. / mse cframe=ligr;run; /* Impact of weighting down the impact of influential points by half */ reweight allobs / reset;print;run; /* restore to original results */ reweight obs.=45 or obs.=40 / weight=0.5;refit;print;run; plot r.*p. / mse cframe=ligr;run;