Skip to content

uniq -c

13 messages · R. Michael Weylandt, Charles C. Berry, Duncan Murdoch +2 more

#
I need an analogue of "uniq -c" for a data frame.

xtabs(), although dog slow, would have fit the bill nicely:
--8<---------------cut here---------------start------------->8---
user  system elapsed 
 12.788   4.288  17.224
--8<---------------cut here---------------end--------------->8---
but, alas, it fails on larger data:
system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 ))
Error in table(a = 1:32, b = 1:32, c = 1:32, d = 1:32, e = 1:32, f = 1:32,  : 
  attempt to make a table with >= 2^31 elements
(apparently, because the product of the numbers of all the possible
values of all the columns is too large).

rle() seems to be what I really need, but I cannot figure out what it
returns for a simple example:
--8<---------------cut here---------------start------------->8---
Run Length Encoding
  lengths: int 8
  values :'data.frame':	32 obs. of  1 variable:
 $ h: int  1 2 3 4 5 6 7 8 9 10 ...
--8<---------------cut here---------------end--------------->8---
(where are all the other columns?)

and it fails on my actual data (3 columns of factors):
Error in Ops.factor(left, right) : level sets of factors are different

when I replace factors with strings, I get:
Error in `[.data.frame`(x, i) : undefined columns selected

dput:
--8<---------------cut here---------------start------------->8---
structure(list(user = c("45ff768774777593", "45ff768774777593", 
"45ff768774777593", "45ff768774777593", "45ff768774777593", "4bbf9e94cbceb70c", 
"4bbf9e94cbceb70c", "4fbbf2c67e0fb867", "4fbbf2c67e0fb867", "5038d46739f9f516", 
"4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", 
"4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4fe9e0496ecfc55e", 
"4fe9e0496ecfc55e", "4fe9e0496ecfc55e", "506b92707a3aa65f", "502c0a9ba9ce5019", 
"502c0a9ba9ce5019", "502c0a9ba9ce5019", "502c0a9ba9ce5019", "501b52fe24d88162", 
"4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", 
"4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", 
"4fd4852ed504b160", "4fd4852ed504b160", "4e717c219268b736", "4e717c219268b736", 
"4e717c219268b736", "506bb429eeab2af4", "506bb429eeab2af4", "506bb429eeab2af4", 
"4f6f91cb83e1a7ef", "506bb8b62bde3c48", "506bb8b62bde3c48", "506bb8b62bde3c48", 
"506bb8b62bde3c48", "4edff2aeb4df7613", "4edff2aeb4df7613", "506bba652fa6bf78", 
"506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", 
"506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", 
"506a4941b50ca422", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", 
"5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", 
"5036993a16b323d1", "506bb525ffce3add", "506bb6cf52819b5f", "506bb6cf52819b5f", 
"4fe02e08a6a2ce64", "4fe02e08a6a2ce64", "5056d247bc7dae0b", "5056d247bc7dae0b", 
"5056d247bc7dae0b", "506abfecf61bf4e6", "4ec00aa243745ee5", "4ec00aa243745ee5", 
"4ec00aa243745ee5", "4ec00aa243745ee5", "506c69d7fb598afa", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111"), country = c("AR", "AR", 
"AR", "AR", "AR", "BG", "BG", "SK", "SK", "US", "VE", "VE", "VE", 
"VE", "VE", "VE", "VE", "TH", "TH", "TH", "MY", "US", "US", "US", 
"US", "MX", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", 
"JP", "US", "US", "US", "US", "US", "US", "NP", "US", "US", "US", 
"US", "MX", "MX", "US", "AR", "AR", "AR", "AR", "AR", "AR", "AR", 
"AR", "AR", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "US", 
"MM", "MM", "US", "US", "IN", "IN", "IN", "IN", "CA", "CA", "CA", 
"CA", "US", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", 
"DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE"), 
    language = c("es", "es", "es", "es", "es", "bg", "bg", "sk", 
    "sk", "en", "es", "es", "es", "es", "es", "es", "es", "th", 
    "th", "th", "en", "en", "en", "en", "en", "es", "en", "en", 
    "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", 
    "en", "en", "en", "en", "en", "en", "en", "en", "en", "es", 
    "es", "en", "es", "es", "es", "es", "es", "es", "es", "es", 
    "es", "es", "es", "es", "es", "es", "es", "es", "es", "en", 
    "en", "en", "en", "en", "en", "en", "en", "en", "fr", "fr", 
    "fr", "fr", "en", "de", "de", "de", "de", "de", "de", "de", 
    "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", 
    "de", "de")), .Names = c("user", "country", "language"), row.names = c(1L, 
2L, 3229330L, 3229331L, 3229332L, 3L, 9504492L, 4L, 9504493L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 4989444L, 
4989445L, 4989446L, 18L, 19L, 20L, 21L, 22L, 95114L, 95115L, 
95116L, 95117L, 9504509L, 9599604L, 23L, 3319263L, 3319264L, 
24L, 25L, 9504513L, 26L, 27L, 28L, 7548719L, 7548720L, 29L, 30L, 
31L, 32L, 2498862L, 2498863L, 2498864L, 2498865L, 4560918L, 4560919L, 
5938642L, 14065408L, 33L, 4348151L, 4348152L, 5627634L, 5627635L, 
9504522L, 13852641L, 15132124L, 34L, 35L, 6400711L, 36L, 2173763L, 
37L, 38L, 39L, 40L, 41L, 10210641L, 10672811L, 16158334L, 42L, 
43L, 44L, 45L, 1974646L, 4032952L, 4032953L, 4032954L, 4032955L, 
4032956L, 4032957L, 4032958L, 4475376L, 4475377L, 4475378L, 4475379L, 
5500564L, 7871329L, 7871330L, 8670694L), class = "data.frame")
--8<---------------cut here---------------end--------------->8---

thanks!
#
Have you looked at using table() directly? If I understand what you
want correctly something like:

table(do.call(paste, x))

Also, if you take a look at the development version of R, changes are
being put in place to allow much larger data sets.

Cheers,
Michael
On Tue, Oct 16, 2012 at 4:03 PM, Sam Steingold <sds at gnu.org> wrote:
#
Sam Steingold <sds at gnu.org> writes:
The count.rows() function is the R analogue.

See

http://orgmode.org/worg/org-contrib/babel/examples/Rpackage.html#sec-6-1


No need to install the package - just copy and paste the function into an
R session.

On cases I've tried that are big enough to matter, it is a good deal
faster than the table( do.call( paste, x )) idiom.

HTH,

Chuck
#
I wished to avoid paste (I will have to re-split later, so it will be a
performance nightmare).
you should not need "much larger data sets" for this.
x is sorted.
#
On 16/10/2012 12:29 PM, Sam Steingold wrote:
The problem is that xtabs() and by() and related functions are designed 
for the case where all combinations of all factors exist. If you have a 
dataset where only a few exist, you could use sparseby() from the 
reshape package.

Syntax would be

sparseby(data=x, INDICES=x, FUN=nrow)

if you wanted a dataframe giving counts.

I just tried it, and on your two examples it gives a warning about 
coercing a list to a logical vector; I guess all(list(TRUE, TRUE)) was 
allowed when I wrote it, but isn't any more.  I'll send a patch to the 
maintainer.

Duncan Murdoch
#
this takes forever; apparently, it does not use the fact that x is
sorted (even then - it should not take more than a few minutes)...
#
Error in `[<-.data.frame`(`*tmp*`, index, , value = list(user = c(2L,  : 
  missing values are not allowed in subscripted assignments of data frames
#
On 16/10/2012 1:46 PM, Sam Steingold wrote:
It was more or less instantaneous on the examples you posted.  It would 
be a bit more honest to say "it was fast on the examples, but it was 
very slow when I ran it on my real data, which consists of 
100000000000000 cases."

Duncan Murdoch
#
sure, I did not mean any insult to your code, sorry.
all I was saying was that it was too slow for my purposes because it
ignores the fact that the data is sorted.
it turned out that paste+sort+rle+strsplit is fast enough.
(although there should be a way to avoid paste/strsplit!)
Thanks!
#
You said you wanted the equivalent of the Unix 'uniq -c' but said
that xtab's results were roughly right and the rle might be what
you want.  rle() is the equivalent of 'uniq -c', they both output the
lengths of runs of identical elements.   if the data is sorted they
are equivalent to using table() or xtabs().

Since you have sorted data try the following

# Mark the rows/elements that open a new run of identical values.
# Returns a logical vector: TRUE at position 1 and wherever the value
# differs from its immediate predecessor.
isFirstInRun <- function(x) UseMethod("isFirstInRun")

isFirstInRun.default <- function(x) {
  # An element starts a run when it is unequal to the element before it.
  n <- length(x)
  c(TRUE, x[-1] != x[-n])
}

isFirstInRun.data.frame <- function(x) {
  stopifnot(ncol(x) > 0)
  # A row opens a new run when ANY of its columns opens a new run,
  # so OR the per-column indicators together.
  Reduce(`|`, lapply(x, isFirstInRun))
}

i <- which(isFirstInRun(yourDataFrame))

Then I think
   data.frame(Count=diff(c(i, 1L+nrow(yourDataFrame))), yourDataFrame[i,])
gives you what you want.   E.g.,
  > yourDataFrame <- data.frame(x1=c(1,1,2,2,1), x2=c(11,11,11,12,11))
  > i <- which(isFirstInRun(yourDataFrame))
  > i
  [1] 1 3 4 5
  > data.frame(Count=diff(c(i, 1L+nrow(yourDataFrame))), yourDataFrame[i,])
    Count x1 x2
  1     2  1 11
  3     1  2 11
  4     1  2 12
  5     1  1 11

It should be pretty quick.  If you have missing values in your data frame,
you will have to make some decisions about whether they should be
considered equal to each other or not and modify isFirstInRun.default
accordingly.

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com
#
Summary of options:

1. William:

# Run-start indicator: TRUE at position 1 and wherever the value changes
# from the previous one.
isFirstInRun <- function(x) UseMethod("isFirstInRun")

isFirstInRun.default <- function(x) {
  c(TRUE, x[-1] != x[-length(x)])
}

isFirstInRun.data.frame <- function(x) {
  stopifnot(ncol(x) > 0)
  # A row starts a run when any one of its columns starts a run.
  Reduce(`|`, lapply(x, isFirstInRun))
}

# "uniq -c" for a sorted data frame: one row per run of identical rows,
# with a `count` column giving the run length.
row.count.1 <- function (x) {
  starts <- which(isFirstInRun(x))
  # Run lengths are the gaps between successive run starts (the sentinel
  # nrow(x)+1 closes the final run).
  data.frame(x[starts,], count = diff(c(starts, 1L + nrow(x))))
}

147 seconds

2. http://orgmode.org/worg/org-contrib/babel/examples/Rpackage.html#sec-6-1
# "uniq -c" for a sorted data frame: returns the unique (run-opening) rows
# of `x` with a `counts` column giving each run's length.
# Assumes identical rows are adjacent (x is sorted).
# NOTE(review): 2:nrow(x) misbehaves for nrow(x) < 2 — TODO confirm
# intended behavior on 0/1-row input.
row.count.2 <- function (x) {
  # TRUE at row i+1 when it equals row i in every column (no differing cells).
  equal.to.previous <- rowSums( x[2:nrow(x),] != x[1:(nrow(x)-1),] )==0
  # Run-length encode the TRUE/FALSE pattern of "same as previous row".
  tf.runs <- rle(equal.to.previous)
  # Rebuild per-run counts: a TRUE-run of length k means one row repeated
  # k+1 times; a FALSE-run of length k means k rows, each occurring once.
  # The leading 1 accounts for the first row of the frame.
  counts <- c(1, unlist(mapply(function(x,y) if (y) x+1 else (rep(1,x)),
                               tf.runs$length, tf.runs$value)))
  # A provisional 1 directly followed by a larger count belongs to the same
  # row as that larger count — drop every entry followed by a bigger one.
  counts <- counts[ c( diff( counts ) <= 0, TRUE ) ]
  # Indices of the rows that open each run.
  unique.rows <- which( c(TRUE, !equal.to.previous ) )
  cbind(x[ unique.rows, ,drop=FALSE ], counts)
}

136 seconds

3. Michael: paste/strsplit

# "uniq -c" via paste/rle/strsplit: collapse each row to a single string so
# rle() can run-length encode the rows, then split the unique strings back
# into columns.  Assumes `x` is sorted, has exactly the columns
# user/country/language (in that order), and that values contain no spaces
# (the paste separator).
row.count.3 <- function (x) {
  pa <- do.call(paste, x)
  # Fix: original called rle(p) on an undefined object `p`.
  rl <- rle(pa)
  # fixed = TRUE: split on a literal space, not a regex.
  sp <- strsplit(as.character(rl$values), " ", fixed = TRUE)
  # vapply (not sapply) pins the result type; $lengths spelled out to avoid
  # relying on partial matching of rl$length.
  data.frame(user     = vapply(sp, `[`, character(1), 1),
             country  = vapply(sp, `[`, character(1), 2),
             language = vapply(sp, `[`, character(1), 3),
             count    = rl$lengths)
}

here I know the columns and rely on absence of spaces in values.

27 seconds.

Thanks to all who answered.
#
Note that the relative speeds of these, which all use basically the same run-length-encoding
algorithm, depend on the nature of the dataset.  I made a million row data.frame with 10,000
unique users, 26 unique countries, and 6 unique languages with c. 3/4 million unique
rows.  Then the times for methods 1, 2, and 3 were 0.7, 6.2, and 10.5 seconds,
respectively.  With a million row data.frame with 100, 10, and 4 unique users, countries,
and languages, with 4000 unique rows, the times were 0.3, 1.4, and 0.7.

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com
#
In addition, adding a factor method for isFirstInRun speeds it up on
long factor variables by c. 60%.

isFirstInRun.factor <- function(x) {
  # Compare on the underlying integer codes; per the author this is about
  # 60% faster on long factors than the default method's factor comparison.
  isFirstInRun(as.integer(x))
}

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com