
I want to compute a number of different measures of centrality and dispersion over multiple time windows on a relatively large dataset (roughly 1 million rows). I have made several different attempts, but the algorithm I end up with is still far too slow for my purposes.

Here is my current iteration:

ts_rollapply <- function(COI, DATE_COL, FUN, n, unit = c("day", "week", "month", "year"), verbose = FALSE, ...) {
  # Initiate Variables
  APPLY_FUNC <- match.fun(FUN = FUN)
  LAST_DATE <- last_date(DATE_COL, n = n, unit = match.arg(unit))
  result <- vector(mode = "numeric", length = length(COI))

  for(i in seq_along(COI)) {
    # Extract range from Column of Interest
    APPLY_RANGE <- COI[DATE_COL > LAST_DATE[i] & DATE_COL <= DATE_COL[i]]
    # Apply function to extracted range
    result[i] <- APPLY_FUNC(APPLY_RANGE, ...)
    if(verbose && i%%100 == 0) {
      ARL <- length(APPLY_RANGE)
      writeLines(sprintf("Last Date: %10s, Current Date: %10s, Iteration: %3d, Length: %3d, Mean: %.2f", 
                         LAST_DATE[i], DATE_COL[i], i, ARL, result[i]))
    }
  }

  result
}

Note that I also wrote a helper function (last_date) to step the dates back by a given number of time periods; it is implemented as follows:

last_date <- function(x, n = 1, unit = c("day", "week", "month", "year")) {
  require(lubridate)

  # Stop function if x is not Class Date.
  if(!is.Date(x)) stop("x is not class: Date")
  if(any(is.na(x))) stop("x contains NA")

  # Match unit and Perform Calculation
  unit <- match.arg(unit)
  result <- switch(unit,
         day = x - n,
         week = x - (7L*n),
         month = x %m-% months(n),
         year = x %m-% months(12L*n))

  result
}
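
For example, stepping back from a month-end date (the dates here are just illustrative):

library(lubridate)

# Stepping back one calendar month from a month-end date rolls to the last
# valid day of the previous month (lubridate's %m-% behaviour)
last_date(ymd("2019-03-31"), n = 1, unit = "month")
# [1] "2019-02-28"

# Stepping back two weeks is a plain 14-day shift
last_date(ymd("2019-03-31"), n = 2, unit = "week")
# [1] "2019-03-17"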

The problem I am facing is that the function works as expected when I run it on a small sample, but it falls over (time-wise) when I scale it up to the full dataset. And I cannot figure out whether it is my implementation of the function that is slow, or the way I call the function from data.table.

library(data.table)
library(lubridate)

# Functions to apply -- I have multiple others, but these should work as example
functions <- c("mean", "median", "sd")

# Toy Data:
DT <- data.table(store = rep(1:10, each = 1000),
                 sales = rnorm(n = 10000, mean = 4500, sd = 2500),
                 date = rep(seq(ymd("2015-01-01"), by = "day", length.out = 1000), 10))

# How I call the ts_rollapply function
DT[, paste("sales_quarter", functions, sep = "_") :=
     lapply(functions, function(x)
       ts_rollapply(sales, date, x, n = 3, unit = "month", na.rm = TRUE)),
   by = store]
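To narrow down which part is slow, a rough check is to time a single store in isolation before scaling up (a quick sketch using base system.time; the store picked here is arbitrary):

# If this is already slow on 1000 rows, the bottleneck is ts_rollapply itself
# rather than the data.table call
small <- DT[store == 1]
system.time(
  ts_rollapply(small$sales, small$date, "mean", n = 3, unit = "month", na.rm = TRUE)
)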

Any help on how to speed up my computation would be much appreciated!


1 Answer


One way is to do a non-equi join

DT[, (cols) := 
    # Non-equi self join: for each row, aggregate SALES of the same STORE over
    # the window [DATE - 7, DATE]; the first three result columns are the join
    # keys and are dropped before assignment
    DT[.(STORE=STORE, START_DATE=DATE - 7L, END_DATE=DATE), 
        on=.(STORE, DATE>=START_DATE, DATE<=END_DATE),
        lapply(functions, function(f) get(f)(SALES)), by=.EACHI][, (1:3) := NULL]
    ]

A faster way should be to fill in the SALES for all dates, so that each STORE has exactly one row per day and a fixed 7-row window corresponds to a 7-day calendar window, and then use data.table::frollapply as mentioned in the comments.

# Expand each STORE to one row per day over its full date range, then roll
res <- DT[DT[, .(DATE=seq(min(DATE), max(DATE), by="1 day")), STORE], on=.(STORE, DATE)][,
    (cols) := lapply(functions, function(f) frollapply(SALES, 7L, f, na.rm=TRUE))]
# Join the rolling results back onto the original rows
DT[res, on=.(STORE, DATE), names(res) := mget(paste0("i.", names(res)))]

If the above suits your real-life problem, then we can create a function with it.
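For instance, a minimal sketch of such a wrapper (the name roll_join_apply, the window_days argument and the sales_roll prefix are placeholders; it assumes the upper-case STORE/SALES/DATE columns from the data below and a window expressed in days):

roll_join_apply <- function(dt, fns, window_days = 7L,
                            cols = paste("sales_roll", fns, sep = "_")) {
  # One lookup row per original row: same STORE, window ending at that row's DATE
  windows <- dt[, .(STORE, START = DATE - window_days, END = DATE)]
  dt[, (cols) :=
       # Non-equi self join; by = .EACHI gives one result row per window, and
       # the first three columns (the join keys) are dropped before assignment
       dt[windows, on = .(STORE, DATE >= START, DATE <= END),
          lapply(fns, function(f) match.fun(f)(SALES)), by = .EACHI][, (1:3) := NULL]
     ]
  dt[]
}

# Usage with the data below, e.g.:
# roll_join_apply(DT, functions, window_days = 7L)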

data:

library(data.table)    
functions <- c("mean", "median", "sd")
nr <- 1e6
DT <- data.table(STORE=rep(1:10, each=nr/10),
    SALES=rnorm(nr, 4500, 2500),
    DATE=rep(seq(as.IDate("2015-01-01"), by="day", length.out=nr/10), 10))
cols <- paste("sales_quarter", functions, sep = "_")
Answered 2019-10-23T03:13:07.227