/***********************************************************************************
		Intellectual property of Zeming Data Science, LLC. 
			Unauthorized use might cause damage
* PROGRAM NAME : Synchronized Multivariate Resample 
* DESCRIPTION : This program demonstrate on performing synchronized multivariate
		resampling based on designated mean or percent.	PROC SURVEYSELECT is 
		applied as the core procedure.  

* PROGRAMMER : Zhiyong (John) Chen*
* DATE WRITTEN : 31 JAN 2019*
************************************************************************************/
OPTIONS MLOGIC MPRINT;
*Creating a simulated population with 50% of male and female, hispanic and non-hispanic, and 
average age around 37.5 year old;

DATA ORIGINAL_DATA (DROP = I);
	LENGTH GENDER $6 RACE $15;
	CALL STREAMINIT(3); 
	DO I = 1 TO 3000;
		GENDER = 'Male';
		IF I LE 1350 THEN RACE = 'Hispanic';
		ELSE RACE = 'Non-Hispanic';
		AGE = RAND("NORMAL", 37.5, 13);
		IF AGE < 0 THEN AGE = 1;
		WEIGHT_VAR = 2*RANUNI(1);
		OUTPUT;
	END;
	DO I = 1 TO 2000;
		GENDER = 'Female';
		IF I LE 900 THEN RACE = 'Hispanic';
		ELSE RACE = 'Non-Hispanic';
		AGE = RAND("NORMAL", 37.5, 13);
		IF AGE < 0 THEN AGE = 1;
		WEIGHT_VAR = 2*RANUNI(1);
		OUTPUT;
	END;
RUN;

PROC MEANS DATA = ORIGINAL_DATA MEAN STDDEV;
	VAR AGE;
	ODS OUTPUT SUMMARY = ORIGINAL_AGE_SUMMARY;
RUN;

PROC UNIVARIATE DATA = ORIGINAL_DATA;
	VAR AGE;
	HISTOGRAM;
RUN;
ODS HTML CLOSE;

*Percent of gender and race;
PROC FREQ DATA = ORIGINAL_DATA;
	TABLE GENDER / OUT = ORIGINAL_GENDER_SUMMARY (DROP = COUNT);
	TABLE RACE / OUT = ORIGINAL_RACE_SUMMARY (DROP = COUNT);
RUN;

**********Preparing Sample Number for Resampling**********;
*Expected percent of gender after resampling;
DATA EXPECT_GENDER;
	LENGTH GENDER $6;
	INPUT GENDER $ EXPECT_GENDER;
	FORMAT EXPECT_GENDER 8.2;
	LABEL	GENDER = 'Gender'
		EXPECT_GENDER = 'Expected Gender Percent (%)';
	DATALINES;
Male 30
Female 70
;
RUN;

*Expected percent of race after resampling;
DATA EXPECT_RACE;
	LENGTH RACE $15;
	INPUT RACE $ EXPECT_RACE;
	FORMAT EXPECT_RACE 8.2;
	LABEL	RACE = 'Race'
		EXPECT_RACE = 'Expected Race Percent (%)';
	DATALINES;
Hispanic 30
Non-Hispanic 70
;
RUN;

*Expected percent of crossed gender and race after resampling;
PROC SQL NOPRINT;
	CREATE TABLE EXPECT_GENDER_RACE AS
	SELECT	A.*, B.*, CATX('*', A.GENDER, B.RACE) AS CROSS_BIN LENGTH = 500 LABEL = 'Cross Bin of Gender and Race', 
		A.EXPECT_GENDER*B.EXPECT_RACE/100 AS EXPECT_PERCENT FORMAT = 8.2 LABEL = 'Expected Percent for Each Cross Bin Level (%)', 
		CEIL(10000*CALCULATED EXPECT_PERCENT/100) AS EXPECT_SAMPLENUM LABEL = 'Expected Sample Number for Each Cross Bin Level'
	FROM EXPECT_GENDER AS A, EXPECT_RACE AS B
	ORDER BY CALCULATED CROSS_BIN;

	SELECT EXPECT_SAMPLENUM INTO: EXPECT_SAMPLENUM SEPARATED BY ","
	FROM EXPECT_GENDER_RACE;

	/*Creating a variable for crossing gender and race, and ordering the data by the created variable*/

	CREATE TABLE WITH_CROSS_BIN AS
	SELECT *, CATX('*', GENDER, RACE) AS CROSS_BIN LENGTH = 500 LABEL = 'Cross Bin of Gender and Race'
	FROM ORIGINAL_DATA
	ORDER BY CALCULATED CROSS_BIN;
QUIT;

*Replacement resampling;
PROC SURVEYSELECT DATA = WITH_CROSS_BIN SEED = 1234 METHOD = URS OUTHITS OUT = REPLACEMENT_GENDER_RACE (DROP = CROSS_BIN)
	SAMPSIZE = (&EXPECT_SAMPLENUM);
	STRATA CROSS_BIN;
RUN;

*Percent of gender and race after replacement-resampling;
PROC FREQ DATA = REPLACEMENT_GENDER_RACE;
	TABLE GENDER / OUT = RESAMPLED_GENDER_SUMMARY (DROP = COUNT);
	TABLE RACE / OUT = RESAMPLED_RACE_SUMMARY (DROP = COUNT);
RUN;

*Non-replacement resampling;
PROC FREQ DATA = WITH_CROSS_BIN;
	TABLE CROSS_BIN / OUT = ORIGINAL_SUMMARY (RENAME = (COUNT = ORIGINAL_SAMPLENUM PERCENT = ORIGINAL_PERCENT));
RUN;

PROC SQL;
	CREATE TABLE COMPARE AS
	SELECT A.*, B.EXPECT_PERCENT, B.EXPECT_PERCENT - A.ORIGINAL_PERCENT AS PERCENT_CHANGE FORMAT = 8.2 LABEL = 'Increase Percent Number after Resampling (%)'
	FROM ORIGINAL_SUMMARY AS A, EXPECT_GENDER_RACE AS B
	WHERE A.CROSS_BIN = B.CROSS_BIN;
QUIT;

PROC SQL;
	CREATE TABLE EXPECT_GENDER_RACE AS
	SELECT CROSS_BIN, EXPECT_PERCENT, ROUND(1100/49*EXPECT_PERCENT) AS EXPECT_SAMPLENUM LABEL = 'Expected Sample Number for Each Cross Bin Level'
	FROM COMPARE;
QUIT;

PROC SURVEYSELECT DATA = WITH_CROSS_BIN SEED = 1234 METHOD = SRS OUT = NON_REPLACEMENT_GENDER_RACE (DROP = CROSS_BIN)
	SAMPSIZE = (471, 1100, 202, 471);
	STRATA CROSS_BIN;
RUN;

*Percent of gender and race after non-replacement resampling;
PROC FREQ DATA = NON_REPLACEMENT_GENDER_RACE;
	TABLE GENDER / OUT = RESAMPLED_GENDER_SUMMARY (DROP = COUNT);
	TABLE RACE / OUT = RESAMPLED_RACE_SUMMARY (DROP = COUNT);
RUN;

*Population percent in a normally distributed population, with GRP6 standing for the percent at mean, GRP5 and GRP7 
for   0.5SD, GRP4 and GRP8 for   SD, GRP3 and GRP9 for   1.5SD, GRP2 and GRP10 for   2SD, GRP1 and GRP11 for   2.5SD;
DATA NORMAL_PERCENT;
	LENGTH FROM TO $25;
	INFILE DATALINES DLM = '$';
	INPUT GRP PERCENT FROM TO;
	FORMAT PERCENT 8.2;
	LABEL	GRP		= 'Bin'
		PERCENT	= 'Expect Percent for Each Bin Level (%)' 
		FROM	= 'Bin Start'
		TO		= 'Bin End';
	DATALINES;
1$1.22$Low$< Expected Mean - 2.25SD
2$2.79$Expected Mean - 2.25SD$< Expected Mean - 1.75SD
3$6.55$Expected Mean - 1.75SD$< Expected Mean - 1.25SD
4$12.1$Expected Mean - 1.25SD$< Expected Mean - 0.75SD
5$17.47$Expected Mean - 0.75SD$< Expected Mean - 0.25SD
6$19.74$Expected Mean - 0.25SD$< Expected Mean + 0.25SD
7$17.47$Expected Mean + 0.25SD$< Expected Mean + 0.75SD
8$12.1$Expected Mean + 0.75SD$< Expected Mean + 1.25SD
9$6.55$Expected Mean + 1.25SD$< Expected Mean + 1.75SD
10$2.79$Expected Mean + 1.75SD$< Expected Mean + 2.25SD
11$1.22$Expected Mean + 2.25SD$HIGH
;
RUN;

*Expected mean of age after resampling is 47.5;
PROC FORMAT;
	VALUE AGEFORMAT
		LOW		- <18.71	= AGE01
		18.71	- <25.11	= AGE02
		25.11	- <31.50	= AGE03
		31.50	- <37.90	= AGE04 
		37.90	- <44.30	= AGE05 
		44.30	- <50.70	= AGE06
		50.70	- <57.10	= AGE07
		57.10	- <63.50	= AGE08
		63.50	- <69.89	= AGE09
		69.89	- <76.29	= AGE10
		76.29	- HIGH		= AGE11;
RUN;

*Expected percent of crossed gender, race and age after resampling;
PROC SQL NOPRINT;
	CREATE TABLE EXPECT_GENDER_RACE_AGE AS
	SELECT	A.*, B.*, CATS('AGE', PUT(C.GRP, Z2.)) AS AGE, C.PERCENT AS EXPECT_AGE LABEL = 'Expect Percent for Age Bin Level (%)',
		CATX('*', A.GENDER, B.RACE, CALCULATED AGE) AS CROSS_BIN LENGTH = 500 LABEL = 'Cross Bin of Gender, Race and Age', 
		A.EXPECT_GENDER*B.EXPECT_RACE*C.PERCENT/100/100 AS EXPECT_PERCENT FORMAT = 8.2 LABEL = 'Expected Percent for Each Cross Bin Level (%)', 
		ROUND(10000*CALCULATED EXPECT_PERCENT/100) AS EXPECT_SAMPLENUM LABEL = 'Expected Sample Number for Each Cross Bin Level'
	FROM EXPECT_GENDER AS A, EXPECT_RACE AS B, NORMAL_PERCENT AS C
	ORDER BY CALCULATED CROSS_BIN;

	SELECT EXPECT_SAMPLENUM INTO: EXPECT_SAMPLENUM SEPARATED BY ","
	FROM EXPECT_GENDER_RACE_AGE;

	CREATE TABLE WITH_CROSS_BIN AS
	SELECT *, CATX('*', GENDER, RACE, PUT(AGE, AGEFORMAT.)) AS CROSS_BIN LENGTH = 500 LABEL = 'Cross Bin of Gender, Race and Age'
	FROM ORIGINAL_DATA
	ORDER BY CALCULATED CROSS_BIN;
QUIT;

*Replacement resampling;
PROC SURVEYSELECT DATA = WITH_CROSS_BIN SEED = 1234 METHOD = URS OUTHITS OUT = REPLACEMENT_GENDER_RACE_AGE
	SAMPSIZE = (&EXPECT_SAMPLENUM);
	STRATA CROSS_BIN;
RUN;

ODS HTML;
PROC MEANS DATA = REPLACEMENT_GENDER_RACE_AGE MEAN STDDEV;
	VAR AGE;
RUN;

PROC UNIVARIATE DATA = REPLACEMENT_GENDER_RACE_AGE;
	VAR AGE;
	HISTOGRAM;
RUN;
ODS HTML CLOSE;

ODS HTML;
*Percent of gender and race after replacement-resampling;
PROC FREQ DATA = REPLACEMENT_GENDER_RACE_AGE;
	TABLE GENDER / OUT = RESAMPLED_GENDER_SUMMARY (DROP = COUNT);
	TABLE RACE / OUT = RESAMPLED_RACE_SUMMARY (DROP = COUNT);
RUN;

*Non-replacement resampling;
PROC FREQ DATA = WITH_CROSS_BIN;
	TABLE CROSS_BIN / OUT = ORIGINAL_SUMMARY (RENAME = (COUNT = ORIGINAL_SAMPLENUM PERCENT = ORIGINAL_PERCENT));
RUN;

PROC SQL NOPRINT;
	CREATE TABLE COMPARE AS
	SELECT A.*, B.EXPECT_PERCENT, B.EXPECT_PERCENT - A.ORIGINAL_PERCENT AS PERCENT_CHANGE FORMAT = 8.2 LABEL = 'Increase Percent Number after Resampling (%)'
	FROM ORIGINAL_SUMMARY AS A, EXPECT_GENDER_RACE_AGE AS B
	WHERE A.CROSS_BIN = B.CROSS_BIN;

	CREATE TABLE EXPECT_GENDER_RACE_AGE AS
	SELECT	CROSS_BIN, EXPECT_PERCENT, ORIGINAL_SAMPLENUM,
		CASE 	WHEN ROUND(95/6.66*EXPECT_PERCENT) > ORIGINAL_SAMPLENUM THEN ORIGINAL_SAMPLENUM
			ELSE ROUND(95/6.66*EXPECT_PERCENT) END AS EXPECT_SAMPLENUM LABEL = 'Expected Sample Number for Each Cross Bin Level'
	FROM COMPARE
	ORDER BY CROSS_BIN;

	SELECT EXPECT_SAMPLENUM INTO: EXPECT_SAMPLENUM SEPARATED BY ","
	FROM EXPECT_GENDER_RACE_AGE;
QUIT;

PROC SURVEYSELECT DATA = WITH_CROSS_BIN SEED = 1234 METHOD = SRS OUT = NON_REPLACEMENT_GENDER_RACE_AGE
	SAMPSIZE = (&EXPECT_SAMPLENUM);
	STRATA CROSS_BIN;
RUN;

PROC MEANS DATA = NON_REPLACEMENT_GENDER_RACE_AGE MEAN STDDEV;
	VAR AGE;
RUN;

PROC UNIVARIATE DATA = RESAMPLE;
	VAR AGE;
	HISTOGRAM;
RUN;
ODS HTML CLOSE;

ODS HTML;

*Percent of gender and race after non-replacement resampling;
PROC FREQ DATA = RESAMPLE;
	TABLE GENDER / OUT = RESAMPLED_GENDER_SUMMARY (DROP = COUNT);
	TABLE RACE / OUT = RESAMPLED_RACE_SUMMARY (DROP = COUNT);
RUN;