Bisquare-Weighted Anova (bANOVA)
/* This block reads the data from NUMBER.DAT into a SAS data file called RAWDATA and sets up a variable named DIFF. RAWDATA will contain 3 variables: IVA, DVA, and DIFF. */
DATA RAWDATA;
/* Names the SAS data set that will be created RAWDATA. */
INFILE 'NUMBER.DAT';
/* Tells SAS that the data are in a file named NUMBER.DAT (VMS specific). */
INPUT IVA DVA;
/* Indicates to SAS to use the first column of values from NUMBER.DAT as the independent variable values and the second column of values as the dependent variable values. Names these IVA and DVA. */
DIFF = 0;
/* DIFF will be the difference between the mean of the DVA and the bisquare weighted average (see below). It is set to zero for now. */
OUTPUT;
/* SAS puts these values into RAWDATA. */
RUN;
/* Forces the execution of the DATA step. */
/* This procedure sets up a data set called EXTEND from the data contained in RAWDATA and MEDIANS. MEDIANS contains the median values for each group. EXTEND will contain 5 variables: IVA, DVA, DIFF, the group medians (MEDDVA) and the absolute median deviation for each group (ABDDVA). */
OPTIONS LINESIZE = 80;
/* Displays the output as 80 characters wide. */
PROC UNIVARIATE DATA = RAWDATA NOPRINT;
/* Provides descriptives (such as the mean and median) for the data set RAWDATA. */
VAR DVA;
/* DVA is to be analyzed from the data set. */
BY IVA;
/* SAS executes this step for each group defined by IVA. */
OUTPUT OUT = MEDIANS MEDIAN = MEDDVA;
/* Puts the results into a file called MEDIANS and names the median MEDDVA. */
RUN;
/* Forces the execution of the PROC command. */
DATA EXTEND;
/* Creates a SAS data file called EXTEND. */
MERGE RAWDATA MEDIANS;
/* EXTEND contains the data from RAWDATA and MEDIANS. */
BY IVA;
/* Arranges the data by the IVA. */
IF _N_ = 1 THEN IVAOLD = 0;
/* These two IFs extend the number of groups in MEDIANS to match the number of subjects in RAWDATA. I.e., if MEDIANS has p groups and RAWDATA has 30 subjects/group, then EXTEND will have 120 rows of data. */
IF IVAOLD ^= IVA THEN DO;
IVAOLD = IVA;
MED = MEDDVA;
END;
/* Ends the IF, THEN loop. */
ABDDVA = ABS(DVA -MED);
/* ABDDVA is the absolute value of the deviation of DVA from the median (MED). */
RETAIN MEDDVA;
/* Tells SAS to keep the new medians. */
DROP IVAOLD MED;
/* Tells SAS to drop the old IVAs and the medians. */
OUTPUT;
/* SAS puts these values into EXTEND. */
RUN;
/* Forces the execution of the DATA step. */
PROC DATASETS NOLIST;
/* Deletes the MEDIANS data file. */
DELETE MEDIANS;
RUN;
/* Forces the execution of the PROC step. */
/* This block sets up a data set called TEST from the data contained in RAWDATA and MEDABDEV. MEDABDEV contains the median of the absolute deviation from the median (MAD2DVA). TEST will contain 5 variables: IVA, DVA, DIFF, the group medians (MED2DVA) and the medians of the absolute median deviation for each group (MAD2DVA). */
PROC UNIVARIATE DATA = EXTEND NOPRINT;
/* Gives descriptives on EXTEND and suppresses the printing. */
VAR ABDDVA DVA;
/* ABDDVA and DVA are to be analyzed from the EXTEND data set. */
BY IVA;
/* SAS executes this step for each group defined by the IVA. */
OUTPUT OUT = MEDABDEV MEDIAN = MAD2DVA MED2DVA;
/* Puts the results into a file called MEDABDEV and names the median of ABDDVA to MAD2DVA and the median of DVA to MED2DVA */
RUN;
/* Forces the execution of the PROC step. */
PROC DATASETS NOLIST;
/* Deletes the EXTEND data file. */
DELETE EXTEND;
RUN;
/* Forces the execution of the PROC step. */
DATA TEST;
/* Creates a SAS data file called TEST that has the data from RAWDATA and MEDABDEV. */
MERGE RAWDATA MEDABDEV;
BY IVA;
/* Arranges the data by the IVA. */
IF _N_ = 1 THEN IVAOLD = 0;
/* This procedure extends the number of groups in MEDABDEV to match the number of subjects in RAWDATA. */
IF IVAOLD ^= IVA THEN DO;
IVAOLD = IVA;
/* Renames IVA to IVAOLD. */
MAD = MAD2DVA;
/* Renames MAD2DVA to MAD. */
MEDKY = MED2DVA;
/* Renames MED2DVA to MEDKY. */
END;
/* Ends the IF, THEN loop. */
RETAIN MAD MEDKY;
/* Tells SAS to keep MAD and MEDKY. */
DROP MAD2DVA MED2DVA IVAOLD;
/* Tells SAS to drop MAD2DVA, MED2DVA, and IVAOLD. */
OUTPUT;
/* SAS puts these values into TEST. */
RUN;
/* Forces the execution of the DATA step. */
PROC DATASETS NOLIST;
/* Deletes the RAWDATA and MEDABDEV files. */
DELETE RAWDATA MEDABDEV;
RUN;
/* Forces the execution of the PROC step. */
/* The following block calculates a bisquare weighted average for each cell and outputs the weights for each element in the cell into a SAS data set called WEIGHTS. WEIGHTS will contain 6 variables: IVA, DVA, DIFF, MED2DVA, MAD2DVA, and WS. The weights will later be used to calculate the weighted ANOVA. */
PROC NLIN DATA= TEST NOHALVE;
/* Fits a nonlinear regression model using the least squares procedure. NOHALVE turns off the step-size search during iteration. (See Example 5; SAS Institute, 1990b, p. 1165.) */
TITLE 'Tukey biweight';
/* Title for this section. */
BY IVA;
/* SAS executes this step for each group defined by IVA. */
PARMS B = 1;
/* B represents the bisquare weighted average. It is nominally set to 1.0 */
IF _ITER_=0 AND _N_=1 THEN B = MEDKY;
/* For the first iteration B is initialized to the median value. */
MODEL DIFF = (DVA - B);
/* The model is set as the difference (DIFF) between the DVA and B. */
DER.B = -1;
/* The derivative of the model with respect to B is -1. */
R = 4;
/* Defines the rejection point as 4 scale units beyond the middle of the distribution. */
SCALE = 1.483 * MAD;
/* SCALE is the unit of measurement for the residuals. MAD is the measure of scale. MAD times 1.483 provides an unbiased estimate of the standard deviation when sampling from a normal distribution (Hampel et al., 1986, Ch. 2). */
RESID = (DIFF - MODEL.DIFF)/SCALE;
/* MODEL.DIFF is the estimated deviation of B from DVA at a given iteration. DIFF is the data point that is estimated. RESID provides a scaled normalized estimate of the residuals. */
IF ABS (RESID) < R THEN PSYS = RESID*((1 - (RESID/R)**2)**2);
/* If the absolute value of the residual is within 4 scale units of the middle of the distribution (R), then the case receives a non-zero weight using the following influence function (Mosteller Tukey, 1977, p. 205). The influence function is called the psy function (Hampel et al., 1986, Ch. 2). */
ELSE PSYS = 0.;
/* If the absolute value of the residual is outside of 4 scale units of the middle of the distribution (R), then the case is given a weight of zero. */
IF RESID ^= 0 THEN WS = PSYS / RESID;
/* If the value of the residual is not equal to zero, then the weight for the case equals the influence function value divided by the residual. */
ELSE WS = 1.;
/* If the residual equals zero, then the weight for that case equals 1.0 (B is at the center of the distribution.) */
_WEIGHT_ = WS;
/* Replaces SAS's NLIN weight value with the weight value calculated from the biweight procedure. */
ID WS;
/* Specifies that the variable WS will be output to a SAS data set. */
OUTPUT OUT=WEIGHTS;
/* Puts results into a SAS data set called WEIGHTS. */
RUN;
/* Executes the above procedure. */
PROC DATASETS NOLIST;
/* Deletes the TEST data file. */
DELETE TEST;
RUN;
/* Forces the execution of the PROC step. */
PROC PRINT DATA = WEIGHTS;
/* Prints the data set with the weights for outlier identification*/
RUN;
/* The following block calculates the weighted ANOVA from the weights obtained from the previous procedure. The results are output into the file ANOVA which will contain 7 variables: NAME (DVA), SOURCE (IVA and Error) TYPE (IVA and SS), DF, SS, F, and PROB. */
PROC GLM DATA = WEIGHTS OUTSTAT = ANOVA NOPRINT;
/* Uses the General Linear Model to calculate an ANOVA on the WEIGHTS data set and puts the results into ANOVA. */
CLASS IVA;
/* Classifies IVA as the variable of interest. */
MODEL DVA = IVA;
/* Defines DVA as the dependent variable and IVA as the independent variable. */
WEIGHT WS;
/*Indicates that the DVA will be weighted using the weights (WS) obtained in the above procedure when calculating the ANOVA. */
RUN;
/* Forces the execution of the GLM step. */
PROC DATASETS NOLIST;
/* Deletes WEIGHTS. */
DELETE WEIGHTS;
RUN;
/* Forces the execution of the PROC step. */
/* This block calculates the F value for the weighted ANOVA. It also calculates the approximate F value (APPF) based on the regression equation in footnote 3 and the corresponding probability value (PROBFA). These values are listed in OUTF: NAME (DVA), SOURCE (IVA), TYPE (SS1), DF, SS, F, PROB, DFERROR, APPF, PROBFA. */
DATA OUTF;
/* Creates SAS file called OUTF. */
SET ANOVA;
/* Uses the ANOVA file. */
IF _TYPE_ = 'ERROR' THEN DO;
/* Labels the degrees of freedom for the error term in the ANOVA file DFERROR. */
DFERROR = DF;
END;
/* Ends the IF, THEN loop. */
RETAIN DFERROR;
/* Tells SAS to keep the DFERROR term. */
IF _TYPE_ = 'SS1' THEN DO;
/* Calculates an approximate F value (APPF based on the regression equation.) */
APPF = (.534 +(.001206 * DFERROR)) * F;
/* The regression equation for the approximate F. */
PROBFA = 1 - PROBF (APPF, DF, DFERROR, 0);
/* Finds the probability value for the APPF value and puts the output into OUTF. */
OUTPUT;
END;
/* Ends the IF, THEN loop. */
RUN;
/* Forces the execution of the DATA step. */
PROC PRINT DATA = OUTF;
/* Prints OUTF. */
RUN;
/* Forces the execution of the PROC step. */