Skip to content

vectorisation

4 messages · Brett Robinson, arun, Rui Barradas +1 more

#
Hi,

Not sure this helps:

ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4))
mm <- apply(ml, 2, cumsum)
starts<- data.frame(matrix(0,600,4))
starts1<- data.frame(matrix(0,600,4))
for (i in 1:4){
starts1[,i][mm[,i]] <-1
}

starts2<-as.data.frame(do.call(cbind,lapply(1:4,function(i) {starts[,i][mm[,i]]<-1;starts[,i]})))
colnames(starts2)<- colnames(starts1)
identical(starts1,starts2)
#[1] TRUE

#loop should be fast compared to lapply
ml <- data.frame(matrix(sample(1:1e4,1e3, replace=TRUE),100,4))
mm <- apply(ml, 2, cumsum)
starts<- data.frame(matrix(0,1e6,4))
starts1<- data.frame(matrix(0,1e6,4))
system.time({for (i in 1:4){
starts1[,i][mm[,i]] <-1
}})
?#user ?system elapsed?
? # 0.63 ? ?0.30 ? ?0.92?
system.time(starts2<-as.data.frame(do.call(cbind,lapply(1:4,function(i) {starts[,i][mm[,i]]<-1;starts[,i]}))))

# user ?system elapsed?
?# ?0.74 ? ?0.26 ? ?1.03?

colnames(starts2)<- colnames(starts1)
identical(starts1,starts2)
#[1] TRUE
A.K.



----- Original Message -----
From: Brett Robinson <brett.robinson at 7dials.com>
To: "r-help at r-project.org" <r-help at r-project.org>
Cc: 
Sent: Saturday, February 2, 2013 11:38 AM
Subject: [R] vectorisation

Hi
I'm trying to set up a simulation problem without resorting to (m)any loops. I want to set entries in a data frame of zeros ('starts' in the code below) to 1 at certain points and the points have been randomly generated and stored in a separate data.frame ('sl'), which has the same number of columns.

An example of the procedure is as follows:
ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4))
mm <- apply(ml, 2, cumsum)
starts<- data.frame(matrix(0,600,4))

I can achieve the result I want with a loop:
for (i in 1:4){
lstarts[,i][mm[,i]] <-1
}

But as I want to use a large number of columns I would like to do away with the loop

Can anyone suggest how this might be done?

Thanks in advance

Brett Robinson

______________________________________________________________________
This email has been scanned by the Symantec Email Security.cloud service.
For more information please visit http://www.symanteccloud.com
______________________________________________________________________
??? [[alternative HTML version deleted]]

______________________________________________
R-help at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
#
Hello,

Try the following.

set.seed(4315)
ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4))
mm <- apply(ml, 2, cumsum)
s2 <- starts <- data.frame(matrix(0,600,4))

for (i in 1:4){
	starts[,i][mm[,i]] <- 1
}

s2[] <- lapply(seq_len(ncol(mm)), function(i) {s2[,i][mm[,i]] <- 1; s2[,i]})

identical(s2, starts) # TRUE


Note that lapply is a loop in disguise.

Hope this helps,

Rui Barradas

Em 02-02-2013 16:38, Brett Robinson escreveu:
#
On 02-02-2013, at 17:38, Brett Robinson <brett.robinson at 7dials.com> wrote:

            
Another way is this

f2 <- function(starts, mm) {
    mn <- cbind(as.vector(mm),rep(1:ncol(mm),each=nrow(mm)))
    x <- as.matrix(starts) 
    x[mn] <- 1              
    as.data.frame(x)
}

starts2 <- f2(starts,mm)
#> identical(starts2,starts1)
# [1] TRUE

Collect all the options presented so far in functions, use the compiler package to see if that helps
and do some speed tests with Arun's parameters.

# Brett
f1 <- function(starts, mm) {
    for (i in 1:ncol(mm)){
        starts[,i][mm[,i]] <-1
    }
    starts
}

# Berend
f2 <- function(starts, mm) {
    mn <- cbind(as.vector(mm),rep(1:ncol(mm),each=nrow(mm)))
    x <- as.matrix(starts) 
    x[mn] <- 1              
    as.data.frame(x)
}

# Rui
f3 <- function(s2,mm) {
    s2[] <- lapply(seq_len(ncol(mm)), function(i) {s2[,i][mm[,i]] <- 1; s2[,i]})
    s2
}

# Arun
f4 <- function(starts,mm) {
    starts2 <- as.data.frame(do.call(cbind,lapply(1:ncol(mm),function(i) {starts[,i][mm[,i]]<-1;starts[,i]})))
    colnames(starts2)<- colnames(starts)
    starts2
}

library(compiler)
f1c <- cmpfun(f1)
f2c <- cmpfun(f2)
f3c <- cmpfun(f3)
f4c <- cmpfun(f4)

library(rbenchmark)

# Arun's test
set.seed(11)
starts <- data.frame(matrix(0,1e6,4))
ml <- data.frame(matrix(sample(1:1e4,1e3, replace=TRUE),100,4))
mm <- apply(ml, 2, cumsum)

z1 <- f1(starts,mm)
z2 <- f2(starts,mm)
z3 <- f3(starts,mm)
z4 <- f4(starts,mm)
z1c <- f1c(starts,mm)
z2c <- f2c(starts,mm)
z3c <- f3c(starts,mm)
z4c <- f4c(starts,mm)

identical(z2,z1)
identical(z3,z1)
identical(z4,z1)
identical(z1c,z1)
identical(z2c,z1)
identical(z3c,z1)
identical(z4c,z1)

benchmark( f1(starts,mm) , f2(starts,mm),
           f1c(starts,mm), f2c(starts,mm),
           f3(starts,mm) , f4(starts,mm),
           f3c(starts,mm), f4c(starts,mm),
           replications=1,order="relative", columns=c("test","relative","elapsed","replications"))

Result:

# > identical(z2,z1)
# [1] TRUE
# > identical(z3,z1)
# [1] TRUE
# > identical(z4,z1)
# [1] TRUE
# > identical(z1c,z1)
# [1] TRUE
# > identical(z2c,z1)
# [1] TRUE
# > identical(z3c,z1)
# [1] TRUE
# > identical(z4c,z1)
# [1] TRUE
# > 
# > benchmark( f1(starts,mm) , f2(starts,mm),
# +            f1c(starts,mm), f2c(starts,mm),
# +            f3(starts,mm) , f4(starts,mm),
# +            f3c(starts,mm), f4c(starts,mm),
# +            replications=1,order="relative", columns=c("test","relative","elapsed","replications"))
#              test relative elapsed replications
# 2  f2(starts, mm)    1.000   0.195            1
# 4 f2c(starts, mm)    1.005   0.196            1
# 1  f1(starts, mm)    2.990   0.583            1
# 3 f1c(starts, mm)    3.082   0.601            1
# 7 f3c(starts, mm)    3.903   0.761            1
# 5  f3(starts, mm)    3.949   0.770            1
# 8 f4c(starts, mm)    4.436   0.865            1
# 6  f4(starts, mm)    4.462   0.870            1

Compiling doesn't deliver significant speed gains in this case.
Function f2 is the quickest.

Berend