Splits.c

Go to the documentation of this file.
00001 
00009 #include "party.h"
00010 
00011 
00033 void C_split(const double *x, int p,
00034              const double *y, int q,
00035              const double *weights, int n,
00036              const int *orderx,
00037              SEXP splitctrl, SEXP linexpcov2sample, 
00038              SEXP expcovinf, double *cutpoint, double *maxstat, 
00039              double *statistics) {
00040 
00041     double *dExp_y, *dCov_y, *dlinstat, *dexpect, *dcovar, 
00042            tol, sweights, minprob, minbucket, w, tx, f1, f2, f1w, f2ww, tmp;
00043     double minobs, maxobs, xmax;
00044     int lastj, i, j, k;
00045 
00046     if (p != 1) error("C_split: p not equal to one");
00047     tol = get_tol(splitctrl);
00048 
00049     /* init statistics and determine the maximal value with positive weight 
00050        since we can't choose this one as cutpoint
00051     */
00052     xmax = 0.0;
00053     for (i = 0; i < n; i++) {
00054         statistics[i] = 0.0;
00055         if (weights[i] > 0.0 && x[i] > xmax) xmax = x[i];
00056     }
00057 
00058     /* we already have expecation and covariance of the response
00059      * values and the sum of the weights */
00060     dExp_y = REAL(GET_SLOT(expcovinf, PL2_expectationSym));
00061     dCov_y = REAL(GET_SLOT(expcovinf, PL2_covarianceSym));
00062     sweights = REAL(GET_SLOT(expcovinf, PL2_sumweightsSym))[0];
00063 
00064     /* if there is something to split */
00065     if (sweights > 1) {
00066 
00067         /* we need to ensure that at least minbucket weights 
00068            are there to split (either left or right) */
00069         minprob = get_minprob(splitctrl);
00070         minbucket = get_minbucket(splitctrl);
00071         minobs = sweights * minprob + 1.0;
00072 
00073         if (minobs < minbucket) 
00074             minobs = minbucket; 
00075         maxobs = sweights * (1 - minprob) - 1.0;
00076         if (maxobs > sweights - minbucket) 
00077             maxobs = sweights - minbucket; 
00078 
00079         f1 = (double) sweights / (sweights - 1);
00080         f2 = 1.0 / (sweights - 1);
00081         w = 0.0;
00082     
00083         /* pointers to the R-objects */
00084         dlinstat = REAL(GET_SLOT(linexpcov2sample, PL2_linearstatisticSym));
00085         for (k = 0; k < q; k++) dlinstat[k] = 0.0;
00086         dexpect = REAL(GET_SLOT(linexpcov2sample, PL2_expectationSym));
00087         dcovar = REAL(GET_SLOT(linexpcov2sample, PL2_covarianceSym));
00088 
00089         tx = 0.0;
00090         lastj = 0;
00091 
00092         /* for all possible cutpoints (defined by the observations x) */
00093         for (i = 0; i < (n - 1); i++) {
00094     
00095             /* the ordering of the ith observation */
00096             j = orderx[i] - 1;
00097         
00098             /* if the corresponding weight is zero */
00099             if (weights[j] == 0.0) continue;
00100 
00101             /* just a check: can be removed later */
00102             if (w > 0 && x[j] < tx)
00103                 warning("C_split: inconsistent ordering: %f < %f!\n", 
00104                         x[j], tx);
00105         
00106             /* handle ties: delete the entry of the last visited observation
00107                (take care of zero weights!) */
00108             if (w > 0 && x[j] == tx)
00109                 statistics[lastj] = 0.0; 
00110 
00111             /* store the value and position of the j smallest observation */
00112             tx = x[j];
00113             lastj = j;
00114         
00115             w += weights[j];
00116 
00117             /* do not consider those splits */
00118             if (w > maxobs || x[j] >= xmax) break;
00119 
00120             /* compute the linear statistic and expectation and 
00121              * covariance if needed */
00122             for (k = 0; k < q; k++)
00123                 dlinstat[k] += y[n * k + j] * weights[j];
00124  
00125             if (w >= minobs) {
00126                 for (k = 0; k < q; k++)
00127                     dexpect[k] = w * dExp_y[k];
00128 
00129                 f1w = f1 * w;
00130                 f2ww = f2 * w * w;
00131                 for (k = 0; k < q*q; k++)
00132                     dcovar[k] = f1w * dCov_y[k] - f2ww * dCov_y[k];
00133             } else {
00134                 continue;
00135             }
00136         
00137             /* the absolute standardized test statistic, to be maximized */
00138             /* statistics[j] = C_maxabsTestStatistic(dlinstat, 
00139                    dexpect, dcovar, q, tol); */
00140 
00141             /* much faster but uses maxabs always*/
00142             statistics[j] = 0.0;
00143             for (k = 0; k < q; k++) {
00144                 if (dcovar[k * q + k] <= tol) continue;
00145                 tmp = fabs(dlinstat[k] - dexpect[k]) / sqrt(dcovar[k * q + k]);
00146                 if (statistics[j] < tmp) statistics[j] = tmp;
00147             }
00148 
00149         }
00150     
00151         /* search for the maximum and the best separating cutpoint */
00152         maxstat[0] = 0.0;        
00153         for (i = 0; i < n; i++) {
00154             if (statistics[i] > maxstat[0]) {
00155                 maxstat[0] = statistics[i];
00156                 cutpoint[0] = x[i];
00157             }
00158         }
00159     }
00160 }
00161 
00162 
00175 SEXP R_split(SEXP x, SEXP y, SEXP weights, SEXP orderx, SEXP linexpcov2sample, 
00176              SEXP expcovinf, SEXP splitctrl) {
00177              
00178     SEXP ans, cutpoint, maxstat, statistics;
00179     
00180     PROTECT(ans = allocVector(VECSXP, 3));
00181     SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00182     SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00183     SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00184     
00185     C_split(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00186             INTEGER(orderx), splitctrl, linexpcov2sample, expcovinf,
00187             REAL(cutpoint), REAL(maxstat), REAL(statistics));
00188     UNPROTECT(1);
00189     return(ans);
00190 }
00191 
00192 
00217 void C_splitcategorical(const int *codingx, int p,
00218                         const double *y, int q,
00219                         const double *weights, int n,
00220                         double *standstat,
00221                         SEXP splitctrl, SEXP linexpcov2sample, 
00222                         SEXP expcovinf, double *cutpoint, int *levelset, 
00223                         double *maxstat, double *statistics) {
00224 
00225     double *tmpx, *tmptmpx, tmp = 0.0;
00226     int *irank, *ordertmpx, i, j, k, l, jp, chk;
00227 
00228     /* allocate memory */
00229     tmpx = Calloc(n, double);
00230     ordertmpx = Calloc(n, int);
00231     irank = Calloc(p, int);
00232     tmptmpx = Calloc(n, double);
00233 
00234     /* for all response variables (aka: dummy variables) */
00235     for (j = 0; j < q; j++) {
00236     
00237         jp = j * p;
00238 
00239         /* determine the ranking of the kth level among 
00240            the standardized statistic: This induced an ordering of the 
00241            observations */
00242         for (k = 0; k < p; k++) {
00243             irank[k] = 1;
00244             for (l = 0; l < p; l++)
00245                 if (standstat[jp + l] < standstat[jp + k]) irank[k]++;
00246         }
00247         
00248         /* a temporary response variable: the rank of the level */
00249         for (i = 0; i < n; i++) {
00250             /* <FIXME> do we have to adjust weights for missing values here??? */
00251             if (weights[i] == 0.0) {
00252                 tmpx[i] = 0.0;
00253             } else {
00254                 tmpx[i] = (double) irank[codingx[i] - 1];
00255             }
00256             /* </FIXME> */
00257             tmptmpx[i] = tmpx[i];
00258             ordertmpx[i] = i + 1;
00259         }
00260         
00261         /* order(dtmpx) */
00262         rsort_with_index(tmptmpx, ordertmpx, n);
00263 
00264         /* search for a cutpoint (now we do have an ordering) */
00265         C_split(tmpx, 1, y, q, weights, n, ordertmpx,
00266                 splitctrl, linexpcov2sample,
00267                 expcovinf, cutpoint, maxstat, statistics);
00268 
00269         /* if we have seen an improvement: save this segmentation 
00270            note: there may be splits with equal goodness */
00271         chk = 0;
00272         if (maxstat[0] > tmp) {
00273             for (k = 0; k < p; k++) {
00274                 if (irank[k] > cutpoint[0]) {
00275                     levelset[k] = 1;
00276                     chk += 1;
00277                 } else {
00278                     levelset[k] = 0;
00279                 }
00280             }
00281             tmp = maxstat[0];
00282         }
00283         /* <FIXME> make sure that at least one level goes left,
00284                    C_split may end up with cutpoint > max(irank), why?
00285            </FIXME>
00286         */
00287         /* hm, why did I added 
00288         if (chk == 0) tmp = 0.0; 
00289         ??? */
00290     }
00291     maxstat[0] = tmp;
00292 
00293     /* free memory */
00294     Free(tmpx); Free(ordertmpx); Free(irank); Free(tmptmpx);
00295 }
00296 
00297 
00311 SEXP R_splitcategorical(SEXP x, SEXP codingx, SEXP y, SEXP weights, 
00312                         SEXP linexpcov2sample, SEXP linexpcov, 
00313                         SEXP expcovinf, SEXP splitctrl) {
00314              
00315     SEXP ans, cutpoint, maxstat, statistics, levelset;
00316     double *standstat;
00317 
00318     C_LinStatExpCov(REAL(x), ncol(x), REAL(y), ncol(y), REAL(weights), nrow(x),
00319                     1, GET_SLOT(linexpcov, PL2_expcovinfSym), linexpcov);
00320 
00321     standstat = Calloc(get_dimension(linexpcov), double);
00322     C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)),
00323                   REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
00324                   REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
00325                   get_dimension(linexpcov), get_tol(splitctrl), standstat);
00326 
00327     PROTECT(ans = allocVector(VECSXP, 4));
00328     SET_VECTOR_ELT(ans, 0, cutpoint = allocVector(REALSXP, 1));
00329     SET_VECTOR_ELT(ans, 1, maxstat = allocVector(REALSXP, 1));
00330     SET_VECTOR_ELT(ans, 2, statistics = allocVector(REALSXP, nrow(x)));
00331     SET_VECTOR_ELT(ans, 3, levelset = allocVector(INTSXP, ncol(x)));
00332     
00333     C_splitcategorical(INTEGER(codingx), ncol(x), REAL(y), ncol(y), REAL(weights), 
00334                        nrow(x), standstat, 
00335                        splitctrl, linexpcov2sample, expcovinf, 
00336                        REAL(cutpoint), INTEGER(levelset), REAL(maxstat), 
00337                        REAL(statistics));
00338 
00339     UNPROTECT(1);
00340     Free(standstat);
00341     return(ans);
00342 }