Hi,
I'm analyzing a large number of large simulation datasets, and I've
isolated one of the bottlenecks. Any help in speeding it up would be
appreciated.
`dat` is a dataframe of samples from a regular grid. The first two
columns are the spatial coordinates of the samples, the remaining 20
columns are the abundances of species in each cell. I need to calculate
the species richness in adjacent cells for each cell in the sample.
For example, if I have nine cells in my dataframe (X = 1:3, Y = 1:3):
a b c
d e f
g h i
I need to calculate the neighbour-richness for each cell; for a, this is
the richness of cells b, d and e combined. The neighbour richness of
cell e would be the combined richness of all the other eight cells.
The following code does what I what, but it's slow. The sample dataset
'dat', below, represents a 5x5 grid, 25 samples. It takes about 1.5
seconds on my computer. The largest samples I am working with have a 51
x 51 grid (2601 cells) and take 4.5 minutes. This is manageable, but
since I have potentially hundreds of these analyses to run, trimming
that down would be very helpful.
After loading the function and the data, the call
system.time(tmp <- time.test(dat))
Will run the code. Note that I've excised this from a larger, more
general function, after determining that for large datasets this section
is responsible for a slowdown from 10-12 seconds to ca. 250 seconds.
Thanks for your patience,
Tyler
time.test <- function(dat) {
cen <- dat
grps <- 5
n.rich <- numeric(grps^2)
n.ind <- 1
for(i in 1:grps)
for (j in 1:grps) {
n.cen <- numeric(ncol(cen) - 2)
neighbours <- expand.grid((j-1):(j+1), (i-1):(i+1))
neighbours <- neighbours[-5,]
neighbours <- neighbours[which(neighbours[,1] %in% 1:grps &
neighbours[,2] %in% 1:grps),]
for (k in 1:nrow(neighbours))
n.cen <- n.cen + cen[cen$X == neighbours[k,1] &
cen$Y == neighbours[k,2], -c(1:2)]
n.rich[n.ind] <- sum(as.logical(n.cen))
n.ind <- n.ind + 1
}
return(n.rich)
}
`dat` <- structure(list(
X = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1,
2, 3, 4, 5), Y = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4,
4, 4, 4, 4, 5, 5, 5, 5, 5), V1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 45L, 131L, 0L, 0L, 34L,
481L, 1744L), V2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 88L, 0L, 70L, 101L, 13L, 634L, 0L, 0L, 71L, 640L, 1636L), V3
= c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 49L, 3L, 113L,
1L, 44L, 167L, 336L, 933L, 0L, 14L, 388L, 1180L, 1709L), V4 = c(0L,
0L, 0L, 0L, 0L, 0L, 3L, 12L, 0L, 0L, 2L, 1L, 36L, 45L, 208L, 7L,
221L, 213L, 371L, 1440L, 26L, 211L, 389L, 1382L, 1614L), V5 = c(96L,
7L, 0L, 0L, 0L, 10L, 17L, 0L, 5L, 0L, 0L, 11L, 151L, 127L, 160L,
27L, 388L, 439L, 1117L, 1571L, 81L, 598L, 1107L, 1402L, 891L), V6 =
c(16L, 30L, 13L, 0L, 0L, 10L, 195L, 60L, 29L, 29L, 1L, 107L, 698L,
596L, 655L, 227L, 287L, 677L, 1477L, 1336L, 425L, 873L, 961L, 1360L,
1175L), V7 = c(249L, 101L, 69L, 0L, 18L, 186L, 331L, 291L, 259L,
248L, 336L, 404L, 642L, 632L, 775L, 455L, 801L, 697L, 1063L, 978L,
626L, 686L, 1204L, 1138L, 627L), V8 = c(300L, 163L, 65L, 145L, 377L,
257L, 690L, 655L, 420L, 288L, 346L, 461L, 1276L, 897L, 633L, 812L,
1018L, 1337L, 1295L, 1163L, 550L, 1104L, 768L, 933L, 433L), V9 =
c(555L, 478L, 374L, 349L, 357L, 360L, 905L, 954L, 552L, 438L, 703L,
984L, 1616L, 1732L, 1234L, 1213L, 1518L, 1746L, 1191L, 967L, 1394L,
1722L, 1706L, 610L, 169L), V10 = c(1527L, 1019L, 926L, 401L, 830L,
833L, 931L, 816L, 1126L, 1232L, 1067L, 1169L, 1270L, 1277L, 1145L,
1159L, 1072L, 1534L, 997L, 391L, 1328L, 1414L, 1037L, 444L, 1L), V11
= c(1468L, 1329L, 1013L, 603L, 1096L, 1237L, 1488L, 1189L, 1064L,
1303L, 1258L, 1479L, 1421L, 1365L, 1101L, 1415L, 1145L, 1329L,
1325L, 236L, 1379L, 1199L, 729L, 328L, 0L), V12 = c(983L, 1459L,
791L, 898L, 911L, 1215L, 1528L, 960L, 1172L, 1286L, 1358L, 722L,
857L, 1478L, 1452L, 1502L, 1013L, 745L, 455L, 149L, 1686L, 917L,
1013L, 84L, 0L), V13 = c(1326L, 1336L, 1110L, 1737L, 1062L, 1578L,
1382L, 1537L, 1366L, 1308L, 1301L, 1357L, 746L, 622L, 934L, 1132L,
954L, 460L, 270L, 65L, 957L, 699L, 521L, 18L, 1L), V14 = c(1047L,
1315L, 1506L, 1562L, 1254L, 1336L, 1106L, 1213L, 1220L, 1457L, 858L,
1606L, 590L, 726L, 598L, 945L, 732L, 258L, 45L, 6L, 937L, 436L, 43L,
0L, 0L), V15 = c(845L, 935L, 1295L, 1077L, 1400L, 1049L, 802L,
1247L, 1449L, 1046L, 1134L, 877L, 327L, 352L, 470L, 564L, 461L,
166L, 0L, 0L, 230L, 110L, 29L, 0L, 0L), V16 = c(784L, 675L, 1157L,
1488L, 1511L, 1004L, 420L, 523L, 733L, 724L, 833L, 542L, 171L, 116L,
384L, 357L, 197L, 0L, 0L, 0L, 246L, 0L, 0L, 0L, 0L), V17 = c(444L,
873L, 530L, 596L, 448L, 431L, 109L, 446L, 378L, 243L, 284L, 148L,
69L, 30L, 6L, 71L, 32L, 131L, 0L, 0L, 120L, 0L, 0L, 0L, 0L), V18 =
c(307L, 128L, 823L, 566L, 383L, 454L, 62L, 90L, 99L, 261L, 351L,
94L, 81L, 0L, 37L, 113L, 28L, 0L, 0L, 0L, 15L, 17L, 0L, 0L, 0L), V19
= c(53L, 138L, 207L, 451L, 282L, 40L, 31L, 7L, 128L, 137L, 166L,
38L, 0L, 1L, 0L, 0L, 19L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V20 =
c(0L, 14L, 121L, 127L, 71L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 7L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("X", "Y", "V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12",
"V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20"), row.names =
c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
19L, 20L, 21L, 22L, 23L, 25L, 26L, 27L, 28L, 29L), class =
"data.frame")
Need help optimizing/vectorizing nested loops
4 messages · Charles C. Berry, Tyler Smith, Bart Joosen
On Tue, 9 Dec 2008, tyler wrote:
Hi, I'm analyzing a large number of large simulation datasets, and I've isolated one of the bottlenecks. Any help in speeding it up would be appreciated.
Cast the neighborhoods as an indicator matrix, then use matrix multiplications:
system.time(tmp <- time.test(dat))
user system elapsed
1.18 0.00 1.20
system.time({
+ mn <- with(dat,outer(1:25,1:25, function(i,j) abs(X[i]-X[j])<2 & abs(Y[i]-Y[j])<2 & i!=j ))
+ print(all.equal(rowSums(mn%*%as.matrix(dat[,-(1:2)])>0),tmp))})
[1] TRUE
user system elapsed
0 0 0
HTH,
Chuck
`dat` is a dataframe of samples from a regular grid. The first two
columns are the spatial coordinates of the samples, the remaining 20
columns are the abundances of species in each cell. I need to calculate
the species richness in adjacent cells for each cell in the sample.
For example, if I have nine cells in my dataframe (X = 1:3, Y = 1:3):
a b c
d e f
g h i
I need to calculate the neighbour-richness for each cell; for a, this is
the richness of cells b, d and e combined. The neighbour richness of
cell e would be the combined richness of all the other eight cells.
The following code does what I what, but it's slow. The sample dataset
'dat', below, represents a 5x5 grid, 25 samples. It takes about 1.5
seconds on my computer. The largest samples I am working with have a 51
x 51 grid (2601 cells) and take 4.5 minutes. This is manageable, but
since I have potentially hundreds of these analyses to run, trimming
that down would be very helpful.
After loading the function and the data, the call
system.time(tmp <- time.test(dat))
Will run the code. Note that I've excised this from a larger, more
general function, after determining that for large datasets this section
is responsible for a slowdown from 10-12 seconds to ca. 250 seconds.
Thanks for your patience,
Tyler
time.test <- function(dat) {
cen <- dat
grps <- 5
n.rich <- numeric(grps^2)
n.ind <- 1
for(i in 1:grps)
for (j in 1:grps) {
n.cen <- numeric(ncol(cen) - 2)
neighbours <- expand.grid((j-1):(j+1), (i-1):(i+1))
neighbours <- neighbours[-5,]
neighbours <- neighbours[which(neighbours[,1] %in% 1:grps &
neighbours[,2] %in% 1:grps),]
for (k in 1:nrow(neighbours))
n.cen <- n.cen + cen[cen$X == neighbours[k,1] &
cen$Y == neighbours[k,2], -c(1:2)]
n.rich[n.ind] <- sum(as.logical(n.cen))
n.ind <- n.ind + 1
}
return(n.rich)
}
`dat` <- structure(list(
X = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1,
2, 3, 4, 5), Y = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4,
4, 4, 4, 4, 5, 5, 5, 5, 5), V1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 45L, 131L, 0L, 0L, 34L,
481L, 1744L), V2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 88L, 0L, 70L, 101L, 13L, 634L, 0L, 0L, 71L, 640L, 1636L), V3
= c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 49L, 3L, 113L,
1L, 44L, 167L, 336L, 933L, 0L, 14L, 388L, 1180L, 1709L), V4 = c(0L,
0L, 0L, 0L, 0L, 0L, 3L, 12L, 0L, 0L, 2L, 1L, 36L, 45L, 208L, 7L,
221L, 213L, 371L, 1440L, 26L, 211L, 389L, 1382L, 1614L), V5 = c(96L,
7L, 0L, 0L, 0L, 10L, 17L, 0L, 5L, 0L, 0L, 11L, 151L, 127L, 160L,
27L, 388L, 439L, 1117L, 1571L, 81L, 598L, 1107L, 1402L, 891L), V6 =
c(16L, 30L, 13L, 0L, 0L, 10L, 195L, 60L, 29L, 29L, 1L, 107L, 698L,
596L, 655L, 227L, 287L, 677L, 1477L, 1336L, 425L, 873L, 961L, 1360L,
1175L), V7 = c(249L, 101L, 69L, 0L, 18L, 186L, 331L, 291L, 259L,
248L, 336L, 404L, 642L, 632L, 775L, 455L, 801L, 697L, 1063L, 978L,
626L, 686L, 1204L, 1138L, 627L), V8 = c(300L, 163L, 65L, 145L, 377L,
257L, 690L, 655L, 420L, 288L, 346L, 461L, 1276L, 897L, 633L, 812L,
1018L, 1337L, 1295L, 1163L, 550L, 1104L, 768L, 933L, 433L), V9 =
c(555L, 478L, 374L, 349L, 357L, 360L, 905L, 954L, 552L, 438L, 703L,
984L, 1616L, 1732L, 1234L, 1213L, 1518L, 1746L, 1191L, 967L, 1394L,
1722L, 1706L, 610L, 169L), V10 = c(1527L, 1019L, 926L, 401L, 830L,
833L, 931L, 816L, 1126L, 1232L, 1067L, 1169L, 1270L, 1277L, 1145L,
1159L, 1072L, 1534L, 997L, 391L, 1328L, 1414L, 1037L, 444L, 1L), V11
= c(1468L, 1329L, 1013L, 603L, 1096L, 1237L, 1488L, 1189L, 1064L,
1303L, 1258L, 1479L, 1421L, 1365L, 1101L, 1415L, 1145L, 1329L,
1325L, 236L, 1379L, 1199L, 729L, 328L, 0L), V12 = c(983L, 1459L,
791L, 898L, 911L, 1215L, 1528L, 960L, 1172L, 1286L, 1358L, 722L,
857L, 1478L, 1452L, 1502L, 1013L, 745L, 455L, 149L, 1686L, 917L,
1013L, 84L, 0L), V13 = c(1326L, 1336L, 1110L, 1737L, 1062L, 1578L,
1382L, 1537L, 1366L, 1308L, 1301L, 1357L, 746L, 622L, 934L, 1132L,
954L, 460L, 270L, 65L, 957L, 699L, 521L, 18L, 1L), V14 = c(1047L,
1315L, 1506L, 1562L, 1254L, 1336L, 1106L, 1213L, 1220L, 1457L, 858L,
1606L, 590L, 726L, 598L, 945L, 732L, 258L, 45L, 6L, 937L, 436L, 43L,
0L, 0L), V15 = c(845L, 935L, 1295L, 1077L, 1400L, 1049L, 802L,
1247L, 1449L, 1046L, 1134L, 877L, 327L, 352L, 470L, 564L, 461L,
166L, 0L, 0L, 230L, 110L, 29L, 0L, 0L), V16 = c(784L, 675L, 1157L,
1488L, 1511L, 1004L, 420L, 523L, 733L, 724L, 833L, 542L, 171L, 116L,
384L, 357L, 197L, 0L, 0L, 0L, 246L, 0L, 0L, 0L, 0L), V17 = c(444L,
873L, 530L, 596L, 448L, 431L, 109L, 446L, 378L, 243L, 284L, 148L,
69L, 30L, 6L, 71L, 32L, 131L, 0L, 0L, 120L, 0L, 0L, 0L, 0L), V18 =
c(307L, 128L, 823L, 566L, 383L, 454L, 62L, 90L, 99L, 261L, 351L,
94L, 81L, 0L, 37L, 113L, 28L, 0L, 0L, 0L, 15L, 17L, 0L, 0L, 0L), V19
= c(53L, 138L, 207L, 451L, 282L, 40L, 31L, 7L, 128L, 137L, 166L,
38L, 0L, 1L, 0L, 0L, 19L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V20 =
c(0L, 14L, 121L, 127L, 71L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 7L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("X", "Y", "V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12",
"V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20"), row.names =
c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
19L, 20L, 21L, 22L, 23L, 25L, 26L, 27L, 28L, 29L), class =
"data.frame")
______________________________________________ R-help at r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.
Charles C. Berry (858) 534-2098
Dept of Family/Preventive Medicine
E mailto:cberry at tajo.ucsd.edu UC San Diego
http://famprevmed.ucsd.edu/faculty/cberry/ La Jolla, San Diego 92093-0901
Charles C. Berry writes:
> On Tue, 9 Dec 2008, tyler wrote:
>
> > I'm analyzing a large number of large simulation datasets, and I've
> > isolated one of the bottlenecks. Any help in speeding it up would be
> > appreciated.
>
> Cast the neighborhoods as an indicator matrix, then use matrix
> multiplications:
>
> > system.time({ mn <- with(dat,outer(1:25,1:25, function(i,j)
> abs(X[i]-X[j])<2 & abs(Y[i]-Y[j])<2 & i!=j ))
Wow, that's fantastic! On my biggest matrix, your code cost me 3.854
seconds, compared to 207.769 seconds with my original function. I
might be able to use `outer' to solve some other slowdowns.
Thanks,
Tyler
> HTH,
>
> Chuck
>
>
>
> >
> > `dat` is a dataframe of samples from a regular grid. The first two
> > columns are the spatial coordinates of the samples, the remaining 20
> > columns are the abundances of species in each cell. I need to calculate
> > the species richness in adjacent cells for each cell in the sample.
> > For example, if I have nine cells in my dataframe (X = 1:3, Y = 1:3):
> >
> > a b c
> > d e f
> > g h i
> >
> > I need to calculate the neighbour-richness for each cell; for a, this is
> > the richness of cells b, d and e combined. The neighbour richness of
> > cell e would be the combined richness of all the other eight cells.
> >
> > The following code does what I what, but it's slow. The sample dataset
> > 'dat', below, represents a 5x5 grid, 25 samples. It takes about 1.5
> > seconds on my computer. The largest samples I am working with have a 51
> > x 51 grid (2601 cells) and take 4.5 minutes. This is manageable, but
> > since I have potentially hundreds of these analyses to run, trimming
> > that down would be very helpful.
> >
> > After loading the function and the data, the call
> >
> > system.time(tmp <- time.test(dat))
> >
> > Will run the code. Note that I've excised this from a larger, more
> > general function, after determining that for large datasets this section
> > is responsible for a slowdown from 10-12 seconds to ca. 250 seconds.
> >
> > Thanks for your patience,
> >
> > Tyler
> >
> >
> > time.test <- function(dat) {
> >
> > cen <- dat
> > grps <- 5
> > n.rich <- numeric(grps^2)
> > n.ind <- 1
> >
> > for(i in 1:grps)
> > for (j in 1:grps) {
> > n.cen <- numeric(ncol(cen) - 2)
> > neighbours <- expand.grid((j-1):(j+1), (i-1):(i+1))
> > neighbours <- neighbours[-5,]
> > neighbours <- neighbours[which(neighbours[,1] %in% 1:grps &
> > neighbours[,2] %in% 1:grps),]
> >
> > for (k in 1:nrow(neighbours))
> > n.cen <- n.cen + cen[cen$X == neighbours[k,1] &
> > cen$Y == neighbours[k,2], -c(1:2)]
> >
> > n.rich[n.ind] <- sum(as.logical(n.cen))
> > n.ind <- n.ind + 1
> > }
> >
> > return(n.rich)
> > }
> >
> > `dat` <- structure(list(
> > X = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1,
> > 2, 3, 4, 5), Y = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4,
> > 4, 4, 4, 4, 5, 5, 5, 5, 5), V1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
> > 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 45L, 131L, 0L, 0L, 34L,
> > 481L, 1744L), V2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
> > 0L, 1L, 88L, 0L, 70L, 101L, 13L, 634L, 0L, 0L, 71L, 640L, 1636L), V3
> > = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 49L, 3L, 113L,
> > 1L, 44L, 167L, 336L, 933L, 0L, 14L, 388L, 1180L, 1709L), V4 = c(0L,
> > 0L, 0L, 0L, 0L, 0L, 3L, 12L, 0L, 0L, 2L, 1L, 36L, 45L, 208L, 7L,
> > 221L, 213L, 371L, 1440L, 26L, 211L, 389L, 1382L, 1614L), V5 = c(96L,
> > 7L, 0L, 0L, 0L, 10L, 17L, 0L, 5L, 0L, 0L, 11L, 151L, 127L, 160L,
> > 27L, 388L, 439L, 1117L, 1571L, 81L, 598L, 1107L, 1402L, 891L), V6 =
> > c(16L, 30L, 13L, 0L, 0L, 10L, 195L, 60L, 29L, 29L, 1L, 107L, 698L,
> > 596L, 655L, 227L, 287L, 677L, 1477L, 1336L, 425L, 873L, 961L, 1360L,
> > 1175L), V7 = c(249L, 101L, 69L, 0L, 18L, 186L, 331L, 291L, 259L,
> > 248L, 336L, 404L, 642L, 632L, 775L, 455L, 801L, 697L, 1063L, 978L,
> > 626L, 686L, 1204L, 1138L, 627L), V8 = c(300L, 163L, 65L, 145L, 377L,
> > 257L, 690L, 655L, 420L, 288L, 346L, 461L, 1276L, 897L, 633L, 812L,
> > 1018L, 1337L, 1295L, 1163L, 550L, 1104L, 768L, 933L, 433L), V9 =
> > c(555L, 478L, 374L, 349L, 357L, 360L, 905L, 954L, 552L, 438L, 703L,
> > 984L, 1616L, 1732L, 1234L, 1213L, 1518L, 1746L, 1191L, 967L, 1394L,
> > 1722L, 1706L, 610L, 169L), V10 = c(1527L, 1019L, 926L, 401L, 830L,
> > 833L, 931L, 816L, 1126L, 1232L, 1067L, 1169L, 1270L, 1277L, 1145L,
> > 1159L, 1072L, 1534L, 997L, 391L, 1328L, 1414L, 1037L, 444L, 1L), V11
> > = c(1468L, 1329L, 1013L, 603L, 1096L, 1237L, 1488L, 1189L, 1064L,
> > 1303L, 1258L, 1479L, 1421L, 1365L, 1101L, 1415L, 1145L, 1329L,
> > 1325L, 236L, 1379L, 1199L, 729L, 328L, 0L), V12 = c(983L, 1459L,
> > 791L, 898L, 911L, 1215L, 1528L, 960L, 1172L, 1286L, 1358L, 722L,
> > 857L, 1478L, 1452L, 1502L, 1013L, 745L, 455L, 149L, 1686L, 917L,
> > 1013L, 84L, 0L), V13 = c(1326L, 1336L, 1110L, 1737L, 1062L, 1578L,
> > 1382L, 1537L, 1366L, 1308L, 1301L, 1357L, 746L, 622L, 934L, 1132L,
> > 954L, 460L, 270L, 65L, 957L, 699L, 521L, 18L, 1L), V14 = c(1047L,
> > 1315L, 1506L, 1562L, 1254L, 1336L, 1106L, 1213L, 1220L, 1457L, 858L,
> > 1606L, 590L, 726L, 598L, 945L, 732L, 258L, 45L, 6L, 937L, 436L, 43L,
> > 0L, 0L), V15 = c(845L, 935L, 1295L, 1077L, 1400L, 1049L, 802L,
> > 1247L, 1449L, 1046L, 1134L, 877L, 327L, 352L, 470L, 564L, 461L,
> > 166L, 0L, 0L, 230L, 110L, 29L, 0L, 0L), V16 = c(784L, 675L, 1157L,
> > 1488L, 1511L, 1004L, 420L, 523L, 733L, 724L, 833L, 542L, 171L, 116L,
> > 384L, 357L, 197L, 0L, 0L, 0L, 246L, 0L, 0L, 0L, 0L), V17 = c(444L,
> > 873L, 530L, 596L, 448L, 431L, 109L, 446L, 378L, 243L, 284L, 148L,
> > 69L, 30L, 6L, 71L, 32L, 131L, 0L, 0L, 120L, 0L, 0L, 0L, 0L), V18 =
> > c(307L, 128L, 823L, 566L, 383L, 454L, 62L, 90L, 99L, 261L, 351L,
> > 94L, 81L, 0L, 37L, 113L, 28L, 0L, 0L, 0L, 15L, 17L, 0L, 0L, 0L), V19
> > = c(53L, 138L, 207L, 451L, 282L, 40L, 31L, 7L, 128L, 137L, 166L,
> > 38L, 0L, 1L, 0L, 0L, 19L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V20 =
> > c(0L, 14L, 121L, 127L, 71L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 7L,
> > 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("X", "Y", "V1",
> > "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12",
> > "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20"), row.names =
> > c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
> > 19L, 20L, 21L, 22L, 23L, 25L, 26L, 27L, 28L, 29L), class =
> > "data.frame")
> >
> > ______________________________________________
> > R-help at r-project.org mailing list
> > https://stat.ethz.ch/mailman/listinfo/r-help
> > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> > and provide commented, minimal, self-contained, reproducible code.
> >
>
> Charles C. Berry (858) 534-2098
> Dept of Family/Preventive Medicine
> E mailto:cberry at tajo.ucsd.edu UC San Diego
> http://famprevmed.ucsd.edu/faculty/cberry/ La Jolla, San Diego 92093-0901
>
>
There is no theory of evolution. Just a list of animals Chuck Norris allows to live. http://www.chucknorrisfacts.com/
Hello,
how about changing the last loop to an apply?
time.test2 <- function(dat) {
cen <- dat
grps <- 5
n.rich <- numeric(grps^2)
n.ind <- 1
for(i in 1:grps)
for (j in 1:grps) {
n.cen <- numeric(ncol(cen) - 2)
neighbours <- expand.grid(X=(j-1):(j+1), Y=(i-1):(i+1))
neighbours <- neighbours[-5,]
neighbours <- neighbours[which(neighbours[,1] %in% 1:grps &
neighbours[,2] %in% 1:grps),]
n.rich[n.ind] <-
sum(as.logical(apply(merge(neighbours,cen)[,-c(1,2)],2,sum)))
n.ind <- n.ind + 1
}
return(n.rich)
}
The timings on my system:
system.time(time.test(dat))
user system elapsed
1.65 0.03 1.71
system.time(time.test2(dat))
user system elapsed
0.27 0.00 0.27
I'm still thinking about the optimisation for the selection of the cells,
but for the moment I have no clue about any optimisation for this problem.
Kind Regards
Bart
Tyler Smith wrote:
Hi,
I'm analyzing a large number of large simulation datasets, and I've
isolated one of the bottlenecks. Any help in speeding it up would be
appreciated.
`dat` is a dataframe of samples from a regular grid. The first two
columns are the spatial coordinates of the samples, the remaining 20
columns are the abundances of species in each cell. I need to calculate
the species richness in adjacent cells for each cell in the sample.
For example, if I have nine cells in my dataframe (X = 1:3, Y = 1:3):
a b c
d e f
g h i
I need to calculate the neighbour-richness for each cell; for a, this is
the richness of cells b, d and e combined. The neighbour richness of
cell e would be the combined richness of all the other eight cells.
The following code does what I what, but it's slow. The sample dataset
'dat', below, represents a 5x5 grid, 25 samples. It takes about 1.5
seconds on my computer. The largest samples I am working with have a 51
x 51 grid (2601 cells) and take 4.5 minutes. This is manageable, but
since I have potentially hundreds of these analyses to run, trimming
that down would be very helpful.
After loading the function and the data, the call
system.time(tmp <- time.test(dat))
Will run the code. Note that I've excised this from a larger, more
general function, after determining that for large datasets this section
is responsible for a slowdown from 10-12 seconds to ca. 250 seconds.
Thanks for your patience,
Tyler
time.test <- function(dat) {
cen <- dat
grps <- 5
n.rich <- numeric(grps^2)
n.ind <- 1
for(i in 1:grps)
for (j in 1:grps) {
n.cen <- numeric(ncol(cen) - 2)
neighbours <- expand.grid((j-1):(j+1), (i-1):(i+1))
neighbours <- neighbours[-5,]
neighbours <- neighbours[which(neighbours[,1] %in% 1:grps &
neighbours[,2] %in% 1:grps),]
for (k in 1:nrow(neighbours))
n.cen <- n.cen + cen[cen$X == neighbours[k,1] &
cen$Y == neighbours[k,2], -c(1:2)]
n.rich[n.ind] <- sum(as.logical(n.cen))
n.ind <- n.ind + 1
}
return(n.rich)
}
`dat` <- structure(list(
X = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1,
2, 3, 4, 5), Y = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4,
4, 4, 4, 4, 5, 5, 5, 5, 5), V1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 45L, 131L, 0L, 0L, 34L,
481L, 1744L), V2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 88L, 0L, 70L, 101L, 13L, 634L, 0L, 0L, 71L, 640L, 1636L), V3
= c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 49L, 3L, 113L,
1L, 44L, 167L, 336L, 933L, 0L, 14L, 388L, 1180L, 1709L), V4 = c(0L,
0L, 0L, 0L, 0L, 0L, 3L, 12L, 0L, 0L, 2L, 1L, 36L, 45L, 208L, 7L,
221L, 213L, 371L, 1440L, 26L, 211L, 389L, 1382L, 1614L), V5 = c(96L,
7L, 0L, 0L, 0L, 10L, 17L, 0L, 5L, 0L, 0L, 11L, 151L, 127L, 160L,
27L, 388L, 439L, 1117L, 1571L, 81L, 598L, 1107L, 1402L, 891L), V6 =
c(16L, 30L, 13L, 0L, 0L, 10L, 195L, 60L, 29L, 29L, 1L, 107L, 698L,
596L, 655L, 227L, 287L, 677L, 1477L, 1336L, 425L, 873L, 961L, 1360L,
1175L), V7 = c(249L, 101L, 69L, 0L, 18L, 186L, 331L, 291L, 259L,
248L, 336L, 404L, 642L, 632L, 775L, 455L, 801L, 697L, 1063L, 978L,
626L, 686L, 1204L, 1138L, 627L), V8 = c(300L, 163L, 65L, 145L, 377L,
257L, 690L, 655L, 420L, 288L, 346L, 461L, 1276L, 897L, 633L, 812L,
1018L, 1337L, 1295L, 1163L, 550L, 1104L, 768L, 933L, 433L), V9 =
c(555L, 478L, 374L, 349L, 357L, 360L, 905L, 954L, 552L, 438L, 703L,
984L, 1616L, 1732L, 1234L, 1213L, 1518L, 1746L, 1191L, 967L, 1394L,
1722L, 1706L, 610L, 169L), V10 = c(1527L, 1019L, 926L, 401L, 830L,
833L, 931L, 816L, 1126L, 1232L, 1067L, 1169L, 1270L, 1277L, 1145L,
1159L, 1072L, 1534L, 997L, 391L, 1328L, 1414L, 1037L, 444L, 1L), V11
= c(1468L, 1329L, 1013L, 603L, 1096L, 1237L, 1488L, 1189L, 1064L,
1303L, 1258L, 1479L, 1421L, 1365L, 1101L, 1415L, 1145L, 1329L,
1325L, 236L, 1379L, 1199L, 729L, 328L, 0L), V12 = c(983L, 1459L,
791L, 898L, 911L, 1215L, 1528L, 960L, 1172L, 1286L, 1358L, 722L,
857L, 1478L, 1452L, 1502L, 1013L, 745L, 455L, 149L, 1686L, 917L,
1013L, 84L, 0L), V13 = c(1326L, 1336L, 1110L, 1737L, 1062L, 1578L,
1382L, 1537L, 1366L, 1308L, 1301L, 1357L, 746L, 622L, 934L, 1132L,
954L, 460L, 270L, 65L, 957L, 699L, 521L, 18L, 1L), V14 = c(1047L,
1315L, 1506L, 1562L, 1254L, 1336L, 1106L, 1213L, 1220L, 1457L, 858L,
1606L, 590L, 726L, 598L, 945L, 732L, 258L, 45L, 6L, 937L, 436L, 43L,
0L, 0L), V15 = c(845L, 935L, 1295L, 1077L, 1400L, 1049L, 802L,
1247L, 1449L, 1046L, 1134L, 877L, 327L, 352L, 470L, 564L, 461L,
166L, 0L, 0L, 230L, 110L, 29L, 0L, 0L), V16 = c(784L, 675L, 1157L,
1488L, 1511L, 1004L, 420L, 523L, 733L, 724L, 833L, 542L, 171L, 116L,
384L, 357L, 197L, 0L, 0L, 0L, 246L, 0L, 0L, 0L, 0L), V17 = c(444L,
873L, 530L, 596L, 448L, 431L, 109L, 446L, 378L, 243L, 284L, 148L,
69L, 30L, 6L, 71L, 32L, 131L, 0L, 0L, 120L, 0L, 0L, 0L, 0L), V18 =
c(307L, 128L, 823L, 566L, 383L, 454L, 62L, 90L, 99L, 261L, 351L,
94L, 81L, 0L, 37L, 113L, 28L, 0L, 0L, 0L, 15L, 17L, 0L, 0L, 0L), V19
= c(53L, 138L, 207L, 451L, 282L, 40L, 31L, 7L, 128L, 137L, 166L,
38L, 0L, 1L, 0L, 0L, 19L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), V20 =
c(0L, 14L, 121L, 127L, 71L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 7L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("X", "Y", "V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12",
"V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20"), row.names =
c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
19L, 20L, 21L, 22L, 23L, 25L, 26L, 27L, 28L, 29L), class =
"data.frame")
______________________________________________ R-help at r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.
View this message in context: http://www.nabble.com/Need-help-optimizing-vectorizing-nested-loops-tp20916326p20919781.html Sent from the R help mailing list archive at Nabble.com.