Skip to content

uniq -c

13 messages · R. Michael Weylandt, Charles C. Berry, Duncan Murdoch +2 more

#
I need an analogue of "uniq -c" for a data frame.

xtabs(), although dog slow, would have fit the bill nicely:
--8<---------------cut here---------------start------------->8---
user  system elapsed 
 12.788   4.288  17.224
--8<---------------cut here---------------end--------------->8---
but, alas, it fails on larger data:
system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 ))
Error in table(a = 1:32, b = 1:32, c = 1:32, d = 1:32, e = 1:32, f = 1:32,  : 
  attempt to make a table with >= 2^31 elements
(apparently, because the product of the numbers of all the possible
values of all the columns is too large).

rle() seems to be what I really need, but I cannot figure out what it
returns for a simple example:
--8<---------------cut here---------------start------------->8---
Run Length Encoding
  lengths: int 8
  values :'data.frame':	32 obs. of  1 variable:
 $ h: int  1 2 3 4 5 6 7 8 9 10 ...
--8<---------------cut here---------------end--------------->8---
(where are all the other columns?)

and it fails on my actual data (3 columns of factors):
Error in Ops.factor(left, right) : level sets of factors are different

when I replace factors with strings, I get:
Error in `[.data.frame`(x, i) : undefined columns selected

dput:
--8<---------------cut here---------------start------------->8---
structure(list(user = c("45ff768774777593", "45ff768774777593", 
"45ff768774777593", "45ff768774777593", "45ff768774777593", "4bbf9e94cbceb70c", 
"4bbf9e94cbceb70c", "4fbbf2c67e0fb867", "4fbbf2c67e0fb867", "5038d46739f9f516", 
"4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", 
"4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4fe9e0496ecfc55e", 
"4fe9e0496ecfc55e", "4fe9e0496ecfc55e", "506b92707a3aa65f", "502c0a9ba9ce5019", 
"502c0a9ba9ce5019", "502c0a9ba9ce5019", "502c0a9ba9ce5019", "501b52fe24d88162", 
"4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", 
"4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", 
"4fd4852ed504b160", "4fd4852ed504b160", "4e717c219268b736", "4e717c219268b736", 
"4e717c219268b736", "506bb429eeab2af4", "506bb429eeab2af4", "506bb429eeab2af4", 
"4f6f91cb83e1a7ef", "506bb8b62bde3c48", "506bb8b62bde3c48", "506bb8b62bde3c48", 
"506bb8b62bde3c48", "4edff2aeb4df7613", "4edff2aeb4df7613", "506bba652fa6bf78", 
"506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", 
"506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", 
"506a4941b50ca422", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", 
"5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", 
"5036993a16b323d1", "506bb525ffce3add", "506bb6cf52819b5f", "506bb6cf52819b5f", 
"4fe02e08a6a2ce64", "4fe02e08a6a2ce64", "5056d247bc7dae0b", "5056d247bc7dae0b", 
"5056d247bc7dae0b", "506abfecf61bf4e6", "4ec00aa243745ee5", "4ec00aa243745ee5", 
"4ec00aa243745ee5", "4ec00aa243745ee5", "506c69d7fb598afa", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
"5065a0c59c59c111", "5065a0c59c59c111"), country = c("AR", "AR", 
"AR", "AR", "AR", "BG", "BG", "SK", "SK", "US", "VE", "VE", "VE", 
"VE", "VE", "VE", "VE", "TH", "TH", "TH", "MY", "US", "US", "US", 
"US", "MX", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", 
"JP", "US", "US", "US", "US", "US", "US", "NP", "US", "US", "US", 
"US", "MX", "MX", "US", "AR", "AR", "AR", "AR", "AR", "AR", "AR", 
"AR", "AR", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "US", 
"MM", "MM", "US", "US", "IN", "IN", "IN", "IN", "CA", "CA", "CA", 
"CA", "US", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", 
"DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE"), 
    language = c("es", "es", "es", "es", "es", "bg", "bg", "sk", 
    "sk", "en", "es", "es", "es", "es", "es", "es", "es", "th", 
    "th", "th", "en", "en", "en", "en", "en", "es", "en", "en", 
    "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", 
    "en", "en", "en", "en", "en", "en", "en", "en", "en", "es", 
    "es", "en", "es", "es", "es", "es", "es", "es", "es", "es", 
    "es", "es", "es", "es", "es", "es", "es", "es", "es", "en", 
    "en", "en", "en", "en", "en", "en", "en", "en", "fr", "fr", 
    "fr", "fr", "en", "de", "de", "de", "de", "de", "de", "de", 
    "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", 
    "de", "de")), .Names = c("user", "country", "language"), row.names = c(1L, 
2L, 3229330L, 3229331L, 3229332L, 3L, 9504492L, 4L, 9504493L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 4989444L, 
4989445L, 4989446L, 18L, 19L, 20L, 21L, 22L, 95114L, 95115L, 
95116L, 95117L, 9504509L, 9599604L, 23L, 3319263L, 3319264L, 
24L, 25L, 9504513L, 26L, 27L, 28L, 7548719L, 7548720L, 29L, 30L, 
31L, 32L, 2498862L, 2498863L, 2498864L, 2498865L, 4560918L, 4560919L, 
5938642L, 14065408L, 33L, 4348151L, 4348152L, 5627634L, 5627635L, 
9504522L, 13852641L, 15132124L, 34L, 35L, 6400711L, 36L, 2173763L, 
37L, 38L, 39L, 40L, 41L, 10210641L, 10672811L, 16158334L, 42L, 
43L, 44L, 45L, 1974646L, 4032952L, 4032953L, 4032954L, 4032955L, 
4032956L, 4032957L, 4032958L, 4475376L, 4475377L, 4475378L, 4475379L, 
5500564L, 7871329L, 7871330L, 8670694L), class = "data.frame")
--8<---------------cut here---------------end--------------->8---

thanks!
#
Have you looked at using table() directly? If I understand what you
want correctly something like:

table(do.call(paste, x))

Also, if you take a look at the development version of R, changes are
being put in place to allow much larger data sets.

Cheers,
Michael
On Tue, Oct 16, 2012 at 4:03 PM, Sam Steingold <sds at gnu.org> wrote:
#
Sam Steingold <sds at gnu.org> writes:
The count.rows() function is the R analogue.

See

http://orgmode.org/worg/org-contrib/babel/examples/Rpackage.html#sec-6-1


No need to install the package - just copy and paste the function into an
R session.

On cases I've tried that are big enough to matter, it is a good deal
faster than the table( do.call( paste, x )) idiom.

HTH,

Chuck
#
I wished to avoid paste (I will have to re-split later, so it will be a
performance nightmare).
you should not need "much larger data sets" for this.
x is sorted.
#
On 16/10/2012 12:29 PM, Sam Steingold wrote:
The problem is that xtabs() and by() and related functions are designed 
for the case where all combinations of all factors exist. If you have a 
dataset where only a few exist, you could use sparseby() from the 
reshape package.

Syntax would be

sparseby(data=x, INDICES=x, FUN=nrow)

if you wanted a dataframe giving counts.

I just tried it, and on your two examples it gives a warning about 
coercing a list to a logical vector; I guess all(list(TRUE, TRUE)) was 
allowed when I wrote it, but isn't any more.  I'll send a patch to the 
maintainer.

Duncan Murdoch
#
this takes forever; apparently, it does not use the fact that x is
sorted (even then - it should not take more than a few minutes)...
#
Error in `[<-.data.frame`(`*tmp*`, index, , value = list(user = c(2L,  : 
  missing values are not allowed in subscripted assignments of data frames
#
On 16/10/2012 1:46 PM, Sam Steingold wrote:
It was more or less instantaneous on the examples you posted.  It would 
be a bit more honest to say "it was fast on the examples, but it was 
very slow when I ran it on my real data, which consists of 
100000000000000 cases."

Duncan Murdoch
#
sure, I did not mean any insult to your code, sorry.
all I was saying was that it was too slow for my purposes because it
ignores the fact that the data is sorted.
it turned out that paste+sort+rle+strsplit is fast enough.
(although there should be a way to avoid paste/strsplit!)
Thanks!
#
You said you wanted the equivalent of the Unix 'uniq -c' but said
that xtab's results were roughly right and the rle might be what
you want.  rle() is the equivalent of 'uniq -c', they both output the
lengths of runs of identical elements.   if the data is sorted they
are equivalent to using table() or xtabs().

Since you have sorted data try the following

# Mark the rows/elements that open a new run of identical values.
# Returns a logical vector: TRUE at position 1 and wherever the value
# differs from its immediate predecessor.
isFirstInRun <- function(x) UseMethod("isFirstInRun")

isFirstInRun.default <- function(x) {
  # An element starts a run when it is unequal to the element before it.
  n <- length(x)
  c(TRUE, x[-1] != x[-n])
}

isFirstInRun.data.frame <- function(x) {
  stopifnot(ncol(x) > 0)
  # A row opens a new run when ANY of its columns opens a new run,
  # so OR the per-column indicators together.
  Reduce(`|`, lapply(x, isFirstInRun))
}

i <- which(isFirstInRun(yourDataFrame))

Then I think
   data.frame(Count=diff(c(i, 1L+nrow(yourDataFrame))), yourDataFrame[i,])
gives you what you want.   E.g.,
  > yourDataFrame <- data.frame(x1=c(1,1,2,2,1), x2=c(11,11,11,12,11))
  > i <- which(isFirstInRun(yourDataFrame))
  > i
  [1] 1 3 4 5
  > data.frame(Count=diff(c(i, 1L+nrow(yourDataFrame))), yourDataFrame[i,])
    Count x1 x2
  1     2  1 11
  3     1  2 11
  4     1  2 12
  5     1  1 11

It should be pretty quick.  If you have missing values in your data frame,
you will have to make some decisions about whether they should be
considered equal to each other or not and modify isFirstInRun.default
accordingly.

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com
#
Summary of options:

1. William:

# Run-start indicator: TRUE at position 1 and wherever the value changes
# from the previous one.
isFirstInRun <- function(x) UseMethod("isFirstInRun")

isFirstInRun.default <- function(x) {
  c(TRUE, x[-1] != x[-length(x)])
}

isFirstInRun.data.frame <- function(x) {
  stopifnot(ncol(x) > 0)
  # A row starts a run when any one of its columns starts a run.
  Reduce(`|`, lapply(x, isFirstInRun))
}

# "uniq -c" for a sorted data frame: one row per run of identical rows,
# with a `count` column giving the run length.
row.count.1 <- function (x) {
  starts <- which(isFirstInRun(x))
  # Run lengths are the gaps between successive run starts (the sentinel
  # nrow(x)+1 closes the final run).
  data.frame(x[starts,], count = diff(c(starts, 1L + nrow(x))))
}

147 seconds

2. http://orgmode.org/worg/org-contrib/babel/examples/Rpackage.html#sec-6-1
# "uniq -c" for a sorted data frame: returns the unique (run-opening) rows
# of `x` with a `counts` column giving each run's length.
# Assumes identical rows are adjacent (x is sorted).
# NOTE(review): 2:nrow(x) misbehaves for nrow(x) < 2 — TODO confirm
# intended behavior on 0/1-row input.
row.count.2 <- function (x) {
  # TRUE at row i+1 when it equals row i in every column (no differing cells).
  equal.to.previous <- rowSums( x[2:nrow(x),] != x[1:(nrow(x)-1),] )==0
  # Run-length encode the TRUE/FALSE pattern of "same as previous row".
  tf.runs <- rle(equal.to.previous)
  # Rebuild per-run counts: a TRUE-run of length k means one row repeated
  # k+1 times; a FALSE-run of length k means k rows, each occurring once.
  # The leading 1 accounts for the first row of the frame.
  counts <- c(1, unlist(mapply(function(x,y) if (y) x+1 else (rep(1,x)),
                               tf.runs$length, tf.runs$value)))
  # A provisional 1 directly followed by a larger count belongs to the same
  # row as that larger count — drop every entry followed by a bigger one.
  counts <- counts[ c( diff( counts ) <= 0, TRUE ) ]
  # Indices of the rows that open each run.
  unique.rows <- which( c(TRUE, !equal.to.previous ) )
  cbind(x[ unique.rows, ,drop=FALSE ], counts)
}

136 seconds

3. Michael: paste/strsplit

# "uniq -c" via paste/rle/strsplit: collapse each row to a single string so
# rle() can run-length encode the rows, then split the unique strings back
# into columns.  Assumes `x` is sorted, has exactly the columns
# user/country/language (in that order), and that values contain no spaces
# (the paste separator).
row.count.3 <- function (x) {
  pa <- do.call(paste, x)
  # Fix: original called rle(p) on an undefined object `p`.
  rl <- rle(pa)
  # fixed = TRUE: split on a literal space, not a regex.
  sp <- strsplit(as.character(rl$values), " ", fixed = TRUE)
  # vapply (not sapply) pins the result type; $lengths spelled out to avoid
  # relying on partial matching of rl$length.
  data.frame(user     = vapply(sp, `[`, character(1), 1),
             country  = vapply(sp, `[`, character(1), 2),
             language = vapply(sp, `[`, character(1), 3),
             count    = rl$lengths)
}

here I know the columns and rely on absence of spaces in values.

27 seconds.

Thanks to all who answered.
#
Note that the relative speeds of these, which all use basically the same run-length-encoding
algorithm, depend on the nature of the dataset.  I made a million row data.frame with 10,000
unique users, 26 unique countries, and 6 unique languages with c. 3/4 million unique
rows.  Then the times for methods 1, 2, and 3 were 0.7, 6.2, and 10.5 seconds,
respectively.  With a million row data.frame with 100, 10, and 4 unique users, countries,
and languages, with 4000 unique rows, the times were 0.3, 1.4, and 0.7.

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com
#
In addition, adding a factor method for isFirstInRun speeds it up on
long factor variables by c. 60%.

isFirstInRun.factor <- function(x) {
  # Compare on the underlying integer codes; per the author this is about
  # 60% faster on long factors than the default method's factor comparison.
  isFirstInRun(as.integer(x))
}

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com