* PropVal3 Example -- revised source code file propval3.sas;

* Purpose: split the property data into a training half (Selected=1)
  and a test half (Selected=0), fit a regression of y on x2 x3 x5 x8
  using only the training half, and compute the R^2 for prediction on
  the test half as r2predict = 1 - SSE/SST, where SSE is the sum of
  (ynew - predict)**2 and SST is the sum of (ynew - ybar)**2 over the
  test observations and ybar is the training-set mean of y;

options linesize=70 nodate pageno=1;

title "PropVal3 Example";

* Read the raw data. Data records start on line 15 of the text file;
data propval;
  infile "c:/datasets/propval.txt" firstobs=15;
  input y x1-x9;
  label y="Sale price of house ($1000)"
        x1="Taxes (local, school, county)"
        x2="Number of baths"
        x3="Lot size (1000 sq ft)"
        x4="Living space (sq ft x 1000)"
        x5="Number of garage stalls"
        x6="Number of rooms"
        x7="Number of bedrooms"
        x8="Age of home (years)"
        x9="Number of fireplaces";
run;

* Split dataset into training set (Selected=1) and test set
  (Selected=0). Training set is used to fit the model, test set is
  used to validate the model. OUTALL keeps every observation and adds
  the Selected flag. The seed is fixed so that everyone who runs this
  code gets the same split;
proc surveyselect data=propval method=srs seed=49143 outall
                  samprate=0.5 out=subsets;
run;

* Old code before revision:
    proc surveyselect data=propval method=srs seed=43543
         samprate=0.5 rep=2 out=subsets;

proc print data=subsets;
  title2 "subsets dataset";
run;

* For test-set observations (Selected=0) move the response into ynew
  and set y to missing. PROC REG then leaves those rows out of the fit
  but still writes a predicted value for them to the output dataset;
data newpropval;
  set subsets;
  if Selected=0 then do;
    ynew = y;
    y = .;
  end;
  drop x1 x4 x6 x7 x9;
run;

proc print data=newpropval;
  title2 "newpropval dataset";
run;

* Derive model using only observations with Selected=1 (the rows
  whose y is not missing). The input dataset is named explicitly
  instead of relying on the most recently created dataset;
proc reg data=newpropval;
  title2 "Regression fit on training set";
  model y = x2 x3 x5 x8;
  output out=out p=predict r=resid;
run;
quit;

* Add column called matchcol containing all ones to allow out1 to be
  merged with out2 below;
data out1;
  set out;
  matchcol = 1;
run;

proc print data=out1;
  title2 "out1 dataset";
run;

* Get mean of y from training set (Selected=1). y is missing on the
  test rows, so they do not contribute to the mean;
proc means data=out1 n mean;
  title2 "Mean of y in training set";
  var y;
  output out=ymean mean=ybar;
run;

* Add column called matchcol containing all ones to allow out1 above
  to be merged with out2;
data out2;
  set ymean;
  matchcol = 1;
run;

proc print data=out2;
  title2 "out2 dataset";
run;

* Attach ybar (the single row of out2) to every row of out1 through
  the constant key matchcol, keep only the validation subset
  (Selected=0), and compute for each observation the squared
  prediction error r2 and the squared deviation from ybar d2;
data sse;
  merge out1 out2;
  by matchcol;
  if Selected=0;
  r2 = (ynew - predict)**2;
  d2 = (ynew - ybar)**2;
  keep ynew predict r2 ybar d2;
run;

proc print data=sse;
  title2 "sse dataset";
run;

* Compute SSE (sum of r2) and SST (sum of d2). AUTONAME names the
  output variables r2_Sum and d2_Sum;
proc means data=sse sum;
  title2 "Compute SSE";
  var r2 d2;
  output out=sums sum= / autoname;
run;

proc print data=sums;
run;

* Get SSE and SST from sums dataset and compute R^2 for prediction;
data final;
  set sums;
  r2predict = 1 - (r2_Sum/d2_Sum);
run;

* Print final R^2 for prediction (r2predict);
proc print data=final;
  title2 "R^2 for prediction";
  var r2predict;
run;