/*********************************************************************************** Intellectual property of Zeming Data Science, LLC. Unauthorized use might cause damage * PROGRAM NAME : Synchronized Multivariate Resample * DESCRIPTION : This program demonstrate on performing synchronized multivariate resampling based on designated mean or percent. PROC SURVEYSELECT is applied as the core procedure. * PROGRAMMER : Zhiyong (John) Chen* * DATE WRITTEN : 31 JAN 2019* ************************************************************************************/ OPTIONS MLOGIC MPRINT; *Creating a simulated population with 50% of male and female, hispanic and non-hispanic, and average age around 37.5 year old; DATA ORIGINAL_DATA (DROP = I); LENGTH GENDER $6 RACE $15; CALL STREAMINIT(3); DO I = 1 TO 3000; GENDER = 'Male'; IF I LE 1350 THEN RACE = 'Hispanic'; ELSE RACE = 'Non-Hispanic'; AGE = RAND("NORMAL", 37.5, 13); IF AGE < 0 THEN AGE = 1; WEIGHT_VAR = 2*RANUNI(1); OUTPUT; END; DO I = 1 TO 2000; GENDER = 'Female'; IF I LE 900 THEN RACE = 'Hispanic'; ELSE RACE = 'Non-Hispanic'; AGE = RAND("NORMAL", 37.5, 13); IF AGE < 0 THEN AGE = 1; WEIGHT_VAR = 2*RANUNI(1); OUTPUT; END; RUN; PROC MEANS DATA = ORIGINAL_DATA MEAN STDDEV; VAR AGE; ODS OUTPUT SUMMARY = ORIGINAL_AGE_SUMMARY; RUN; PROC UNIVARIATE DATA = ORIGINAL_DATA; VAR AGE; HISTOGRAM; RUN; ODS HTML CLOSE; *Percent of gender and race; PROC FREQ DATA = ORIGINAL_DATA; TABLE GENDER / OUT = ORIGINAL_GENDER_SUMMARY (DROP = COUNT); TABLE RACE / OUT = ORIGINAL_RACE_SUMMARY (DROP = COUNT); RUN; **********Preparing Sample Number for Resampling**********; *Expected percent of gender after resampling; DATA EXPECT_GENDER; LENGTH GENDER $6; INPUT GENDER $ EXPECT_GENDER; FORMAT EXPECT_GENDER 8.2; LABEL GENDER = 'Gender' EXPECT_GENDER = 'Expected Gender Percent (%)'; DATALINES; Male 30 Female 70 ; RUN; *Expected percent of race after resampling; DATA EXPECT_RACE; LENGTH RACE $15; INPUT RACE $ EXPECT_RACE; FORMAT EXPECT_RACE 8.2; LABEL RACE = 'Race' EXPECT_RACE = 'Expected Race Percent (%)'; DATALINES; Hispanic 30 Non-Hispanic 70 ; RUN; *Expected percent of crossed gender and race after resampling; PROC SQL NOPRINT; CREATE TABLE EXPECT_GENDER_RACE AS SELECT A.*, B.*, CATX('*', A.GENDER, B.RACE) AS CROSS_BIN LENGTH = 500 LABEL = 'Cross Bin of Gender and Race', A.EXPECT_GENDER*B.EXPECT_RACE/100 AS EXPECT_PERCENT FORMAT = 8.2 LABEL = 'Expected Percent for Each Cross Bin Level (%)', CEIL(10000*CALCULATED EXPECT_PERCENT/100) AS EXPECT_SAMPLENUM LABEL = 'Expected Sample Number for Each Cross Bin Level' FROM EXPECT_GENDER AS A, EXPECT_RACE AS B ORDER BY CALCULATED CROSS_BIN; SELECT EXPECT_SAMPLENUM INTO: EXPECT_SAMPLENUM SEPARATED BY "," FROM EXPECT_GENDER_RACE; /*Creating a variable for crossing gender and race, and ordering the data by the created variable*/ CREATE TABLE WITH_CROSS_BIN AS SELECT *, CATX('*', GENDER, RACE) AS CROSS_BIN LENGTH = 500 LABEL = 'Cross Bin of Gender and Race' FROM ORIGINAL_DATA ORDER BY CALCULATED CROSS_BIN; QUIT; *Replacement resampling; PROC SURVEYSELECT DATA = WITH_CROSS_BIN SEED = 1234 METHOD = URS OUTHITS OUT = REPLACEMENT_GENDER_RACE (DROP = CROSS_BIN) SAMPSIZE = (&EXPECT_SAMPLENUM); STRATA CROSS_BIN; RUN; *Percent of gender and race after replacement-resampling; PROC FREQ DATA = REPLACEMENT_GENDER_RACE; TABLE GENDER / OUT = RESAMPLED_GENDER_SUMMARY (DROP = COUNT); TABLE RACE / OUT = RESAMPLED_RACE_SUMMARY (DROP = COUNT); RUN; *Non-replacement resampling; PROC FREQ DATA = WITH_CROSS_BIN; TABLE CROSS_BIN / OUT = ORIGINAL_SUMMARY (RENAME = (COUNT = ORIGINAL_SAMPLENUM PERCENT = ORIGINAL_PERCENT)); RUN; PROC SQL; CREATE TABLE COMPARE AS SELECT A.*, B.EXPECT_PERCENT, B.EXPECT_PERCENT - A.ORIGINAL_PERCENT AS PERCENT_CHANGE FORMAT = 8.2 LABEL = 'Increase Percent Number after Resampling (%)' FROM ORIGINAL_SUMMARY AS A, EXPECT_GENDER_RACE AS B WHERE A.CROSS_BIN = B.CROSS_BIN; QUIT; PROC SQL; CREATE TABLE EXPECT_GENDER_RACE AS SELECT CROSS_BIN, EXPECT_PERCENT, ROUND(1100/49*EXPECT_PERCENT) AS EXPECT_SAMPLENUM LABEL = 'Expected Sample Number for Each Cross Bin Level' FROM COMPARE; QUIT; PROC SURVEYSELECT DATA = WITH_CROSS_BIN SEED = 1234 METHOD = SRS OUT = NON_REPLACEMENT_GENDER_RACE (DROP = CROSS_BIN) SAMPSIZE = (471, 1100, 202, 471); STRATA CROSS_BIN; RUN; *Percent of gender and race after non-replacement resampling; PROC FREQ DATA = NON_REPLACEMENT_GENDER_RACE; TABLE GENDER / OUT = RESAMPLED_GENDER_SUMMARY (DROP = COUNT); TABLE RACE / OUT = RESAMPLED_RACE_SUMMARY (DROP = COUNT); RUN; *Population percent in a normally distributed population, with GRP6 standing for the percent at mean, GRP5 and GRP7 for 0.5SD, GRP4 and GRP8 for SD, GRP3 and GRP9 for 1.5SD, GRP2 and GRP10 for 2SD, GRP1 and GRP11 for 2.5SD; DATA NORMAL_PERCENT; LENGTH FROM TO $25; INFILE DATALINES DLM = '$'; INPUT GRP PERCENT FROM TO; FORMAT PERCENT 8.2; LABEL GRP = 'Bin' PERCENT = 'Expect Percent for Each Bin Level (%)' FROM = 'Bin Start' TO = 'Bin End'; DATALINES; 1$1.22$Low$< Expected Mean - 2.25SD 2$2.79$Expected Mean - 2.25SD$< Expected Mean - 1.75SD 3$6.55$Expected Mean - 1.75SD$< Expected Mean - 1.25SD 4$12.1$Expected Mean - 1.25SD$< Expected Mean - 0.75SD 5$17.47$Expected Mean - 0.75SD$< Expected Mean - 0.25SD 6$19.74$Expected Mean - 0.25SD$< Expected Mean + 0.25SD 7$17.47$Expected Mean + 0.25SD$< Expected Mean + 0.75SD 8$12.1$Expected Mean + 0.75SD$< Expected Mean + 1.25SD 9$6.55$Expected Mean + 1.25SD$< Expected Mean + 1.75SD 10$2.79$Expected Mean + 1.75SD$< Expected Mean + 2.25SD 11$1.22$Expected Mean + 2.25SD$HIGH ; RUN; *Expected mean of age after resampling is 47.5; PROC FORMAT; VALUE AGEFORMAT LOW - <18.71 = AGE01 18.71 - <25.11 = AGE02 25.11 - <31.50 = AGE03 31.50 - <37.90 = AGE04 37.90 - <44.30 = AGE05 44.30 - <50.70 = AGE06 50.70 - <57.10 = AGE07 57.10 - <63.50 = AGE08 63.50 - <69.89 = AGE09 69.89 - <76.29 = AGE10 76.29 - HIGH = AGE11; RUN; *Expected percent of crossed gender, race and age after resampling; PROC SQL NOPRINT; CREATE TABLE EXPECT_GENDER_RACE_AGE AS SELECT A.*, B.*, CATS('AGE', PUT(C.GRP, Z2.)) AS AGE, C.PERCENT AS EXPECT_AGE LABEL = 'Expect Percent for Age Bin Level (%)', CATX('*', A.GENDER, B.RACE, CALCULATED AGE) AS CROSS_BIN LENGTH = 500 LABEL = 'Cross Bin of Gender, Race and Age', A.EXPECT_GENDER*B.EXPECT_RACE*C.PERCENT/100/100 AS EXPECT_PERCENT FORMAT = 8.2 LABEL = 'Expected Percent for Each Cross Bin Level (%)', ROUND(10000*CALCULATED EXPECT_PERCENT/100) AS EXPECT_SAMPLENUM LABEL = 'Expected Sample Number for Each Cross Bin Level' FROM EXPECT_GENDER AS A, EXPECT_RACE AS B, NORMAL_PERCENT AS C ORDER BY CALCULATED CROSS_BIN; SELECT EXPECT_SAMPLENUM INTO: EXPECT_SAMPLENUM SEPARATED BY "," FROM EXPECT_GENDER_RACE_AGE; CREATE TABLE WITH_CROSS_BIN AS SELECT *, CATX('*', GENDER, RACE, PUT(AGE, AGEFORMAT.)) AS CROSS_BIN LENGTH = 500 LABEL = 'Cross Bin of Gender, Race and Age' FROM ORIGINAL_DATA ORDER BY CALCULATED CROSS_BIN; QUIT; *Replacement resampling; PROC SURVEYSELECT DATA = WITH_CROSS_BIN SEED = 1234 METHOD = URS OUTHITS OUT = REPLACEMENT_GENDER_RACE_AGE SAMPSIZE = (&EXPECT_SAMPLENUM); STRATA CROSS_BIN; RUN; ODS HTML; PROC MEANS DATA = REPLACEMENT_GENDER_RACE_AGE MEAN STDDEV; VAR AGE; RUN; PROC UNIVARIATE DATA = REPLACEMENT_GENDER_RACE_AGE; VAR AGE; HISTOGRAM; RUN; ODS HTML CLOSE; ODS HTML; *Percent of gender and race after replacement-resampling; PROC FREQ DATA = REPLACEMENT_GENDER_RACE_AGE; TABLE GENDER / OUT = RESAMPLED_GENDER_SUMMARY (DROP = COUNT); TABLE RACE / OUT = RESAMPLED_RACE_SUMMARY (DROP = COUNT); RUN; *Non-replacement resampling; PROC FREQ DATA = WITH_CROSS_BIN; TABLE CROSS_BIN / OUT = ORIGINAL_SUMMARY (RENAME = (COUNT = ORIGINAL_SAMPLENUM PERCENT = ORIGINAL_PERCENT)); RUN; PROC SQL NOPRINT; CREATE TABLE COMPARE AS SELECT A.*, B.EXPECT_PERCENT, B.EXPECT_PERCENT - A.ORIGINAL_PERCENT AS PERCENT_CHANGE FORMAT = 8.2 LABEL = 'Increase Percent Number after Resampling (%)' FROM ORIGINAL_SUMMARY AS A, EXPECT_GENDER_RACE_AGE AS B WHERE A.CROSS_BIN = B.CROSS_BIN; CREATE TABLE EXPECT_GENDER_RACE_AGE AS SELECT CROSS_BIN, EXPECT_PERCENT, ORIGINAL_SAMPLENUM, CASE WHEN ROUND(95/6.66*EXPECT_PERCENT) > ORIGINAL_SAMPLENUM THEN ORIGINAL_SAMPLENUM ELSE ROUND(95/6.66*EXPECT_PERCENT) END AS EXPECT_SAMPLENUM LABEL = 'Expected Sample Number for Each Cross Bin Level' FROM COMPARE ORDER BY CROSS_BIN; SELECT EXPECT_SAMPLENUM INTO: EXPECT_SAMPLENUM SEPARATED BY "," FROM EXPECT_GENDER_RACE_AGE; QUIT; PROC SURVEYSELECT DATA = WITH_CROSS_BIN SEED = 1234 METHOD = SRS OUT = NON_REPLACEMENT_GENDER_RACE_AGE SAMPSIZE = (&EXPECT_SAMPLENUM); STRATA CROSS_BIN; RUN; PROC MEANS DATA = NON_REPLACEMENT_GENDER_RACE_AGE MEAN STDDEV; VAR AGE; RUN; PROC UNIVARIATE DATA = RESAMPLE; VAR AGE; HISTOGRAM; RUN; ODS HTML CLOSE; ODS HTML; *Percent of gender and race after non-replacement resampling; PROC FREQ DATA = RESAMPLE; TABLE GENDER / OUT = RESAMPLED_GENDER_SUMMARY (DROP = COUNT); TABLE RACE / OUT = RESAMPLED_RACE_SUMMARY (DROP = COUNT); RUN;