r - C5.0 包：粘贴错误(apply(x, 1, paste, collapse = ","), collapse = "\n") : 结果将超过 2^31-1 字节

Question

尝试使用 C5.0 CRAN 包训练具有大约 300 万行和 600 列数据集的模型时，出现以下错误：

粘贴错误(apply(x, 1, paste, collapse = ","), collapse = "\n") : 结果将超过 2^31-1 字节

从存储库所有者对类似问题的回答来看，这是由于字符串中字节数的R 限制，限制为 2^31 - 1。

score 3 · Accepted Answer

前面的答案很长：

因此，如问题所述，错误发生在C5.0中使用的 Cubist包中makeDataFile函数的最后一行，它将所有行连接成一个字符串。由于需要这个字符串来将数据传递给 C 中的 C5.0 函数，但不需要在 R 中进行任何操作，并且 C 除了机器本身的内存限制之外没有内存限制，我采取的方法是而是在 C 中创建这样的字符串。为了做到这一点，R 代码将在一个字符向量中传递信息，该字符向量包含不超过长度限制的各种字符串，而不是一个，这样一旦在 C 中，这些元素就可以连接起来。

但是，我发现 strcat 函数非常慢，因此我选择创建另一个 R 函数（create_max_len_strings），而不是将所有行作为字符向量中的单独元素在循环中使用 strcat 连接起来在不达到内存限制的情况下将行连接成最长（~或接近~）的字符串，因此只需应用几次 strcat 即可连接这些较长的字符串。

因此，原始 makeDataFile() 函数的最后一行将被替换，以便将每一行单独保留为字符向量的一个元素，仅在每个字符串行的末尾添加一个换行符，以便在连接其中一些元素时使用 create_max_len_strings() 将它们分成更长的字符串，它们将被区分：

生成数据文件.R：

create_max_len_strings <- function(original_vector) {

  vector_length = length(original_vector)
  
  nchars = sum(nchar(original_vector, type = "chars"))
  
  ## Check if the length of the string would reach 1900000000, which is close to the memory limitation
  if(nchars >= 1900000000){
  
    ## Calculate how many strings we could create of the maximum length 
    nchunks = 0
    while(nchars > 0){
      nchars = nchars - 1900000000
      nchunks = nchunks + 1
    }
    
    ## Get the number of rows that would be contained in each string
    chunk_size = vector_length/nchunks
    
    ## Get the rounded number of rows in each string
    chunk_size = floor(chunk_size)
    index = chunk_size
    
    ## Create a vector with the indexes of the rows that delimit each string
    indexes_vector = c()  
    indexes_vector = append(indexes_vector, 0)
    
    n = nchunks
    while(n > 0){
      indexes_vector = append(indexes_vector, index)
      index = index + chunk_size
      n = n - 1
    }
    
    ## Get the last few rows if the division had remainder 
    remainder = vector_length %% nchunks
    if (remainder != 0){
      indexes_vector = append(indexes_vector, vector_length)
      nchunks = nchunks + 1
    }
    
    ## Create the strings pasting together the rows from the indexes in the indexes vector
    strings_vector = c()
    i = 2
    while (i <= length(indexes_vector)){
      ## Sum 1 to the index_init so that the next string does not contain the last row of the previous string
      index_init = indexes_vector[i-1] + 1
      index_end = indexes_vector[i]
      
      ## Paste the rows from the vector from index_init to index_end
      string <- paste0(original_vector[index_init:index_end], collapse="")
      ## Create vector containing the strings that were created 
      strings_vector <- append(strings_vector, string)
      i = i + 1
    }
    
  }else {
    strings_vector = paste0(original_vector, collapse="")
  }
  
  strings_vector
}

makeDataFile <- function(x, y, w = NULL) {

    ## Previous code stays the same
    ...

    x = apply(x, 1, paste, collapse = ",")  
    x = paste(x, "\n", sep="")
    char_vec = create_max_len_strings(x)
}

调用 C5.0

现在，为了创建最终字符串以传递给 C 中的 c50() 函数，创建并调用了一个中间函数。为了做到这一点，在 R 中调用 c50() 的 .C() 语句被替换为调用此函数的 .Call() 语句，因为 .Call() 允许将向量等复杂对象传递给 C。此外，它允许在变量result中返回结果，而不必通过引用传回变量tree、rules和output。调用 C5.0 的结果将在字符向量结果中接收，其中包含与树、规则和前三个位置的输出对应的字符串：

C5.0.R：

C5.0.default <- function(x,
                         y,
                         trials = 1,
                         rules = FALSE,
                         weights = NULL,
                         control = C5.0Control(),
                         costs = NULL,
                         ...) {

 ## Previous code stays the same
...

dataString <- makeDataFile(x, y, weights)
num_chars = sum(nchar(dataString, type = "chars"))

result <- .Call(
    "call_C50",
    as.character(namesString),
    dataString,
    as.character(num_chars), ## The length of the resulting string is passed as character because it is too long for an integer
    as.character(costString),
    as.logical(control$subset),
    # -s "use the Subset option" var name: SUBSET
    as.logical(rules),
    # -r "use the Ruleset option" var name: RULES

    ## for the bands option, I'm not sure what the default should be.
    as.integer(control$bands),
    # -u "sort rules by their utility into bands" var name: UTILITY

    ## The documentation has two options for boosting:
    ## -b use the Boosting option with 10 trials
    ## -t trials ditto with specified number of trial
    ## I think we should use -t
    as.integer(trials),
    # -t : " ditto with specified number of trial", var name: TRIALS

    as.logical(control$winnow),
    # -w "winnow attributes before constructing a classifier" var name: WINNOW
    as.double(control$sample),
    # -S : use a sample of x% for training
    #      and a disjoint sample for testing var name: SAMPLE
    as.integer(control$seed),
    # -I : set the sampling seed value
    as.integer(control$noGlobalPruning),
    # -g: "turn off the global tree pruning stage" var name: GLOBAL
    as.double(control$CF),
    # -c: "set the Pruning CF value" var name: CF

    ## Also, for the number of minimum cases, I'm not sure what the
    ## default should be. The code looks like it dynamically sets the
    ## value (as opposed to a static, universal integer
    as.integer(control$minCases),
    # -m : "set the Minimum cases" var name: MINITEMS

    as.logical(control$fuzzyThreshold),
    # -p "use the Fuzzy thresholds option" var name: PROBTHRESH
    as.logical(control$earlyStopping)
  )

## Get the first three positions of the character vector that contain the tree, rules and output returned by C5.0 in C

result_tree = result[1]
result_rules = result[2]
result_output = result[3]

modelContent <- strsplit(
    if (rules)
      result_rules
    else
      result_tree, "\n"
  )[[1]]
  entries <- grep("^entries", modelContent, value = TRUE)
  if (length(entries) > 0) {
    actual <- as.numeric(substring(entries, 10, nchar(entries) - 1))
  } else
    actual <- trials

  if (trials > 1) {
    boostResults <- getBoostResults(result_output)
    ## This next line is here to avoid a false positive warning in R
    ## CMD check:
    ## * checking R code for possible problems ... NOTE
    ## C5.0.default: no visible binding for global variable 'Data'
    Data <- NULL
    size <-
      if (!is.null(boostResults))
        subset(boostResults, Data == "Training Set")$Size
    else
      NA
  }   else {
    boostResults <- NULL
    size <- length(grep("[0-9])$", strsplit(result_output, "\n")[[1]]))
  }

  out <- list(
    names = namesString,
    cost = costString,
    costMatrix = costs,
    caseWeights = !is.null(weights),
    control = control,
    trials = c(Requested = trials, Actual = actual),
    rbm = rules,
    boostResults = boostResults,
    size = size,
    dims = dim(x),
    call = funcCall,
    levels = levels(y),
    output = result_output,
    tree = result_tree,
    predictors = colnames(x),
    rules = result_rules
  )

  class(out) <- "C5.0"
  out
}

现在进入 C 代码，函数 call_c50() 基本上充当 R 代码和 C 代码之间的中间体，连接 dataString 数组中的元素以获得 C 函数c50()所需的字符串，通过访问每个位置使用 CHAR(STRING_ELT(x, i)) 的数组并将它们连接在一起（strcat）。然后将其余变量强制转换为它们各自的类型，并调用文件 top.c 中的 c50() 函数（该函数也应该放置在其中）。调用 c50() 的结果将通过创建一个字符向量并将与tree、rules和output对应的字符串放置在每个位置来返回给 R 例程。

最后，c50() 函数基本上保持原样，除了变量treev、rulesv和outputv，因为这些值将由 .Call() 返回而不是通过引用传递，它们不再需要在函数的参数中。由于它们都是字符串，因此可以通过将每个字符串设置到数组c50_return中的位置来在单个数组中返回它们。

顶部.c：

SEXP call_C50(SEXP namesString, SEXP data_vec, SEXP datavec_len, SEXP costString, SEXP subset, SEXP rules, SEXP bands, SEXP trials, SEXP winnow, SEXP sample, 
SEXP seed, SEXP noGlobalPruning, SEXP CF, SEXP minCases, SEXP fuzzyThreshold, SEXP earlyStopping){

  char* string;
  char* concat;
  long n = 0;
  long size;
  int i;
  char* eptr;
  
  // Get the length of the data vector
  n = length(data_vec);
  
  // Get the string indicating the length of the final string
  char* size_str = malloc((strlen(CHAR(STRING_ELT(datavec_len, 0)))+1)*sizeof(char)); 
  strcpy(size_str, CHAR(STRING_ELT(datavec_len, 0)));
  
  // Turn the string to long
  size = strtol(size_str, &eptr, 10);
  
   // Allocate memory for the number of characters indicated by datavec_len
  string = malloc((size+1)*sizeof(char));
  
  // Copy the first element of data_vec into the string variable
  strcpy(string, CHAR(STRING_ELT(data_vec, 0)));
   
   // Loop over the data vector until all elements are concatenated in the string variable
  for (i = 1; i < n; i++) {
    strcat(string, CHAR(STRING_ELT(data_vec, i)));
  }
  
  // Copy the value of namesString into a char*
  char* namesv = malloc((strlen(CHAR(STRING_ELT(namesString, 0)))+1)*sizeof(char)); 
  strcpy(namesv, CHAR(STRING_ELT(namesString, 0)));
  
   // Copy the value of costString into a char*
  char* costv = malloc((strlen(CHAR(STRING_ELT(costString, 0)))+1)*sizeof(char)); 
  strcpy(costv, CHAR(STRING_ELT(costString, 0)));
  
  // Call c50() function casting the rest of arguments into their respective C types
  char** c50_return = c50(namesv, string, costv, asLogical(subset), asLogical(rules), asInteger(bands), asInteger(trials), asLogical(winnow), asReal(sample), asInteger(seed), asInteger(noGlobalPruning), asReal(CF), asInteger(minCases), asLogical(fuzzyThreshold), asLogical(earlyStopping));
  
  free(string);
  free(namesv);
  free(costv);
  
  // Create a character vector to be returned to the C5.0 R function
  SEXP out = PROTECT(allocVector(STRSXP, 3));

  SET_STRING_ELT(out, 0, mkChar(c50_return[0]));
  SET_STRING_ELT(out, 1, mkChar(c50_return[1])); 
  SET_STRING_ELT(out, 2, mkChar(c50_return[2]));

  UNPROTECT(1);

  return out;
}

static char** c50(char *namesv, char *datav, char *costv, int subset,
                int rules, int utility, int trials, int winnow,
                double sample, int seed, int noGlobalPruning, double CF,
                int minCases, int fuzzyThreshold, int earlyStopping) {
  int val; /* Used by setjmp/longjmp for implementing rbm_exit */
  
  char ** c50_return = malloc(3 * sizeof(char*));

  // Initialize the globals to the values that the c50
  // program would have at the start of execution
  initglobals();
  
  // Set globals based on the arguments.  This is analogous
  // to parsing the command line in the c50 program.
  setglobals(subset, rules, utility, trials, winnow, sample, seed,
             noGlobalPruning, CF, minCases, fuzzyThreshold, earlyStopping,
             costv);
            
  // Handles the strbufv data structure
  rbm_removeall();

  // Deallocates memory allocated by NewCase.
  // Not necessary since it's also called at the end of this function,
  // but it doesn't hurt, and I'm feeling paranoid.
  FreeCases();
  
  // XXX Should this be controlled via an option?
  // Rprintf("Calling setOf\n");
  setOf();

  // Create a strbuf using *namesv as the buffer.
  // Note that this is a readonly strbuf since we can't
  // extend *namesv.
  STRBUF *sb_names = strbuf_create_full(namesv, strlen(namesv))

  // Register this strbuf using the name "undefined.names"
  if (rbm_register(sb_names, "undefined.names", 0) < 0) {
    error("undefined.names already exists");
  }

  // Create a strbuf using *datav and register it as "undefined.data"
  STRBUF *sb_datav = strbuf_create_full(datav, strlen(datav));
  // XXX why is sb_datav copied? was that part of my debugging?
  // XXX or is this the cause of the leak?
  if (rbm_register(strbuf_copy(sb_datav), "undefined.data", 0) < 0) {
    error("undefined data already exists");
  }

  // Create a strbuf using *costv and register it as "undefined.costs"
  if (strlen(costv) > 0) {
    // Rprintf("registering cost matrix: %s", *costv);
    STRBUF *sb_costv = strbuf_create_full(costv, strlen(costv));
    // XXX should sb_costv be copied?
    if (rbm_register(sb_costv, "undefined.costs", 0) < 0) {
      error("undefined.cost already exists");
    }
  } else {
    // Rprintf("no cost matrix to register\n");
  }

  /*
   * We need to initialize rbm_buf before calling any code that
   * might call exit/rbm_exit.
   */
  if ((val = setjmp(rbm_buf)) == 0) {

    // Real work is done here
    c50main();

    if (rules == 0) {
      // Get the contents of the the tree file
      STRBUF *treebuf = rbm_lookup("undefined.tree");
      if (treebuf != NULL) {
        char *treeString = strbuf_getall(treebuf);
        
        c50_return[0] = R_alloc(strlen(treeString) + 1, 1);
        strcpy(c50_return[0], treeString);
        
        c50_return[1] = "";
        
      } else {
        // XXX Should *treev be assigned something in this case?
        // XXX Throw an error?
      }
    } else {
      // Get the contents of the the rules file
      STRBUF *rulesbuf = rbm_lookup("undefined.rules");
      if (rulesbuf != NULL) {
        char *rulesString = strbuf_getall(rulesbuf);
        
        c50_return[1] = R_alloc(strlen(rulesString) + 1, 1);
        strcpy(c50_return[1], rulesString);
        
        c50_return[0] = "";
       
      } else {
        // XXX Should *rulesv be assigned something in this case?
        // XXX Throw an error?
      }
    }
  } else {
    Rprintf("c50 code called exit with value %d\n", val - JMP_OFFSET);
  }

  // Close file object "Of", and return its contents via argument outputv
  char *outputString = closeOf();
  
  c50_return[2] = R_alloc(strlen(outputString) + 1, 1);
  strcpy(c50_return[2], outputString);

  // Deallocates memory allocated by NewCase
  FreeCases();

  // We reinitialize the globals on exit out of general paranoia
  initglobals();
  
  return c50_return;  
}

***重要提示：如果创建的字符串长于 2147483647，您还需要更改 strbuf.c 中函数 strbuf_gets() 中变量 i 和 j 的定义。这个函数基本上遍历字符串的每个位置，因此试图将它们的值增加到超过 INT 限制以访问数组中的这些位置将导致分段错误。我建议将声明类型更改为 long 以避免此问题。

C5.0 预测

但是，由于 makeDataFile 函数不仅用于创建模型，还用于将数据传递给predictions()函数，因此也必须修改此函数。就像之前一样，predict.C5.0()中用于调用 predictions() 的 .C() 语句将被替换为 .Call() 语句，以便能够将字符向量传递给 C，然后将结果将在结果变量中返回，而不是通过引用传递：

预测.C5.0.R：

  predict.C5.0 <- function (object,
            newdata = NULL,
            trials = object$trials["Actual"],
            type = "class",
            na.action = na.pass,
            ...)  {

    ## Previous code stays the same
    ...

    caseString <- makeDataFile(x = newdata, y = NULL)
  
    num_chars = sum(nchar(caseString, type = "chars"))
    

    ## When passing trials to the C code, convert to
    ## zero if the original version of trials is used

    if (trials <= 0)
      stop("'trials should be a positive integer", call. = FALSE)
    if (trials == object$trials["Actual"])
      trials <- 0
      
    ## Add trials (not object$trials) as an argument
    results <- .Call(
      "call_predictions",
      caseString,
      as.character(num_chars),
      as.character(object$names),
      as.character(object$tree),
      as.character(object$rules),
      as.character(object$cost),
      pred = integer(nrow(newdata)),
      confidence = double(length(object$levels) * nrow(newdata)),
      trials = as.integer(trials)
    )
    
    
    predictions = as.numeric(unlist(results[1]))
    confidence = as.numeric(unlist(results[2]))
    output = as.character(results[3])    
    
    if(any(grepl("Error limit exceeded", output)))
      stop(output, call. = FALSE)

    if (type == "class") {
      out <- factor(object$levels[predictions], levels = object$levels)
    } else {
      out <-
        matrix(confidence,
               ncol = length(object$levels),
               byrow = TRUE)
      if (!is.null(rownames(newdata)))
        rownames(out) <- rownames(newdata)
      colnames(out) <- object$levels
    }

out
}

在文件 top.c 中，predictions()函数将被修改为接收 .Call() 语句传递的变量，这样就像之前一样，caseString 数组将被连接成一个字符串，其余的变量铸造成各自的类型。在这种情况下，变量pred和confidence也将作为整数和双精度类型的向量接收，因此需要将它们转换为 int* 和 double*。函数的其余部分保持原样，以便创建预测，结果变量predv、confidencev和输出变量将分别放置在向量的前三个位置。

顶部.c：

SEXP call_predictions(SEXP caseString, SEXP case_len, SEXP names, SEXP tree, SEXP rules, SEXP cost, SEXP pred, SEXP confidence, SEXP trials){

  char* casev;
  char* outputv = "";
  char* eptr;
  
  char* size_str = malloc((strlen(CHAR(STRING_ELT(case_len, 0)))+1)*sizeof(char)); 
  strcpy(size_str, CHAR(STRING_ELT(case_len, 0)));
  
  long size = strtol(size_str, &eptr, 10);
  
  casev = malloc((size+1)*sizeof(char));
  strcpy(casev, CHAR(STRING_ELT(caseString, 0)));
  
  int n = length(caseString);
  
  for (int i = 1; i < n; i++) {
    strcat(casev, CHAR(STRING_ELT(caseString, i)));
  }
  
  char* namesv = malloc((strlen(CHAR(STRING_ELT(names, 0)))+1)*sizeof(char)); 
  strcpy(namesv, CHAR(STRING_ELT(names, 0)));
  
  char* treev = malloc((strlen(CHAR(STRING_ELT(tree, 0)))+1)*sizeof(char)); 
  strcpy(treev, CHAR(STRING_ELT(tree, 0)));
  
  char* rulesv = malloc((strlen(CHAR(STRING_ELT(rules, 0)))+1)*sizeof(char)); 
  strcpy(rulesv, CHAR(STRING_ELT(rules, 0)));
  
  char* costv = malloc((strlen(CHAR(STRING_ELT(cost, 0)))+1)*sizeof(char)); 
  strcpy(costv, CHAR(STRING_ELT(cost, 0)));
  
  int variable;
  int* predv = &variable;
  
  int npred = length(pred);
  predv = malloc((npred+1)*sizeof(int));
  
  for (int i = 0; i < npred; i++) {
    predv[i] = INTEGER(pred)[i];
  }
  
  double variable1;
  double* confidencev = &variable1;
  
  int nconf = length(confidence);
  confidencev = malloc((nconf+1)*sizeof(double));
  
  for (int i = 0; i < nconf; i++) {
    confidencev[i] = REAL(confidence)[i];
  }
  
  int* trialsv = &variable;
  *trialsv = asInteger(trials);

  /* Original code for predictions starts */
  
  int val; 

  // Announce ourselves for testing
  // Rprintf("predictions called\n");

  // Initialize the globals
  initglobals();

  // Handles the strbufv data structure
  rbm_removeall();

  // XXX Should this be controlled via an option?
  // Rprintf("Calling setOf\n");
  setOf();

  STRBUF *sb_cases = strbuf_create_full(casev, strlen(casev));
  if (rbm_register(sb_cases, "undefined.cases", 0) < 0) {
    error("undefined.cases already exists");
  }

  STRBUF *sb_names = strbuf_create_full(namesv, strlen(namesv));
  if (rbm_register(sb_names, "undefined.names", 0) < 0) {
    error("undefined.names already exists");
  }

  if (strlen(treev)) {
    STRBUF *sb_treev = strbuf_create_full(treev, strlen(treev));
    
    if (rbm_register(sb_treev, "undefined.tree", 0) < 0) {
      error("undefined.tree already exists");
    }
  } else if (strlen(rulesv)) {
    STRBUF *sb_rulesv = strbuf_create_full(rulesv, strlen(rulesv));
    
    if (rbm_register(sb_rulesv, "undefined.rules", 0) < 0) {
      error("undefined.rules already exists");
    }
    setrules(1);
  } else {
    error("either a tree or rules must be provided");
  }

  // Create a strbuf using *costv and register it as "undefined.costs"
  if (strlen(costv) > 0) {
    // Rprintf("registering cost matrix: %s", *costv);
    STRBUF *sb_costv = strbuf_create_full(costv, strlen(costv));
    // XXX should sb_costv be copied?
    if (rbm_register(sb_costv, "undefined.costs", 0) < 0) {
      error("undefined.cost already exists");
    }
  } else {
    // Rprintf("no cost matrix to register\n");
  }

  
  if ((val = setjmp(rbm_buf)) == 0) {
    // Real work is done here
    // Rprintf("\n\nCalling rpredictmain\n");
    rpredictmain(trialsv, predv, confidencev);

    // Rprintf("predict finished\n\n");
  } else {
    // Rprintf("predict code called exit with value %d\n\n", val - JMP_OFFSET);
  }

  // Close file object "Of", and return its contents via argument outputv
  char *outputString = closeOf();
  char *output = R_alloc(strlen(outputString) + 1, 1);
  strcpy(output, outputString);

  // We reinitialize the globals on exit out of general paranoia
  initglobals();


  /* Original code for predictions ends */
  
  free(namesv);
  free(treev);
  free(rulesv);
  free(costv);
  
  SEXP predx = PROTECT(allocVector(INTSXP, npred));
  
  for (int i = 0; i < npred; i++) {
    INTEGER(predx)[i] = predv[i];
  }
  
  SEXP confidencex = PROTECT(allocVector(REALSXP, nconf));
  
  for (int i = 0; i < npred; i++) {
    REAL(confidencex)[i] = confidencev[i];
  }
  
  SEXP outputx = PROTECT(allocVector(STRSXP, 1));
  SET_STRING_ELT(outputx, 0, mkChar(output));

  SEXP vector = PROTECT(allocVector(VECSXP, 3));
  SET_VECTOR_ELT(vector, 0, predx);
  SET_VECTOR_ELT(vector, 1, confidencex);
  SET_VECTOR_ELT(vector, 2, outputx);

  UNPROTECT(4);
  
  free(predv);
  free(confidencev);
  
  return vector;
}

r - C5.0 包：粘贴错误(apply(x, 1, paste, collapse = ","), collapse = "\n") : 结果将超过 2^31-1 字节

1 回答 1

调用 C5.0

C5.0 预测

Related

Reference