Skip to content

IP-Address

15 messages · Peter Dalgaard, Erich Neuwirth, Allan Engelhardt +5 more

#
IP addresses are very (very!) difficult to parse and sort correctly 
because there are all sorts of supported formats.  Try to use something 
like PostgreSQL instead: it is already implemented there.  But if you 
are sure all your data is of the n.n.n.n form, then something along the 
lines of the following should basically work (I have chosen some more 
interesting IP addresses for this):


# Example data. The columns are passed straight to data.frame(): wrapping
# them in cbind() first would build a character matrix and silently turn the
# numeric id/rank columns into strings.
a <- data.frame(
  id = c(138, 138, 138, 138),
  rank = c(29746, 29746, 29746, 29746),
  color = c("yellow", "red", "blue", "red"),
  status = c("no", "yes", "yes", "no"),
  ip = c("162.131.58.26", "2.131.58.16", "2.2.58.10", "162.131.58.17"),
  stringsAsFactors = FALSE
)
a
#    id  rank  color status            ip
# 1 138 29746 yellow     no 162.131.58.26
# 2 138 29746    red    yes   2.131.58.16
# 3 138 29746   blue    yes     2.2.58.10
# 4 138 29746    red     no 162.131.58.17

# Split every dotted-quad address into its four integer octets, one row per
# address. vapply() insists on exactly four fields per address, so malformed
# input raises an error instead of being silently misaligned in the matrix.
octets <- strsplit(as.character(a$ip), ".", fixed = TRUE)
x <- t(vapply(octets, as.integer, integer(4)))

# Sort the rows numerically by first octet, then second, third and fourth.
a[order(x[, 1], x[, 2], x[, 3], x[, 4]), ]
#    id  rank  color status            ip
# 3 138 29746   blue    yes     2.2.58.10
# 2 138 29746    red    yes   2.131.58.16
# 4 138 29746    red     no 162.131.58.17
# 1 138 29746 yellow     no 162.131.58.26


Getting rid of the conversions including the matrix(unlist) combo is 
left as an exercise (it's too hot here....)

Allan.
edwin Sendjaja wrote:
#
Allan Engelhardt wrote:
Here's one way:

# Parse the dotted-quad addresses into one integer column per octet. Handing
# the strings over via `text =` lets read.table() open and close its own
# text connection, so no connection is left open if parsing fails.
octets <- read.table(text = as.character(a$ip), sep = ".")
# order() over the octet columns sorts numerically, left octet first.
o <- do.call(order, octets)
a[o, ]
#
#' Zero-pad the octets of dotted-quad IPv4 addresses
#'
#' Rewrites each address so that every octet is three digits wide
#' (e.g. "1.2.3.55" becomes "001.002.003.055"). The padded strings can then
#' be sorted as ordinary character data.
#'
#' @param ipstring Character vector (or factor) of addresses in n.n.n.n form.
#' @return Character vector, the same length as `ipstring`, holding the
#'   zero-padded addresses.
normalizedip <- function(ipstring) {
  octets <- strsplit(as.character(ipstring), ".", fixed = TRUE)
  # Return the padded strings (instead of cat()-ing them to the console and
  # returning NULL) so the result can be handed to order() or sort().
  vapply(
    octets,
    function(x) paste(sprintf("%03i", as.integer(x)), collapse = "."),
    character(1)
  )
}

normalizedip("1.2.3.55")
yields
 "001.002.003.055"
and therefore should allow you to sort in correct order.
edwin Sendjaja wrote:

  
    
#
Peter Dalgaard wrote:
here's another:

    library(gsubfn)
    # Zero-pad every run of digits to width 3, then order the rows of `a` by
    # the padded address strings.
    ip_chr <- as.character(a$ip)
    ip_padded <- gsubfn('[0-9]+', ~ sprintf('%03d', as.integer(x)), ip_chr)
    a[order(ip_padded), ]

vQ
1 day later
#
Here is yet another way (DF stands for your data frame, called `a` in the earlier examples):

library(gtools)
# Compute the row permutation from the ip column with gtools::mixedorder(),
# then reorder the rows of DF with it.
ip_order <- mixedorder(DF$ip)
DF[ip_order, ]
On Fri, May 29, 2009 at 12:51 AM, edwin Sendjaja <edwin_0712 at msn.com> wrote:
#
edwin Sendjaja wrote:
I think Peter's code is more R-elegant, though less generic.  Here's a
quick test, with not-so-surprising results.  gsubfn is implemented in R,
not C, and it is painfully slow in this test.  I also added Gabor's
suggestion.

    library(gsubfn)
    library(gtools)
    library(rbenchmark)

    # Benchmark data: n rows of random noise plus a column of random
    # dotted-quad address strings.
    n <- 1000
    df <- data.frame(
      a = rnorm(n),
      b = rnorm(n),
      c = rnorm(n),
      ip = replicate(
        n,
        paste(sample(255, 4), collapse='.'),
        simplify=TRUE
      )
    )

    # Time three ways of sorting df by its ip column, 10 replications each.
    benchmark(
      columns=c('test', 'elapsed'),
      replications=10,
      order=NULL,
      # peda: read the octets with read.table() and order() on the columns.
      peda={
        connection <- textConnection(as.character(df$ip))
        o <- do.call(order, read.table(connection, sep='.'))
        close(connection)
        df[o, ]
      },
      # waku: zero-pad each run of digits with gsubfn(), order the strings.
      waku=df[
        order(
          gsubfn(perl=TRUE,
            '[0-9]+',
            ~ sprintf('%03d', as.integer(x)),
            as.character(df$ip))
        ),
      ],
      # gagr: gtools::mixedorder() applied to the address column.
      gagr=df[mixedorder(df$ip), ]
    )
         
    # peda 0.070
    # waku 7.070
    # gagr 4.710


vQ
#
library(gsubfn)
library(gtools)
library(rbenchmark)

# Benchmark of four ways to sort a data frame by a column of dotted-quad IP
# address strings. Timings are collected in `res` and printed at the end.

# Test data: n rows of random noise plus an `ip` column built by pasting four
# values drawn by sample(255, 4) together with ".".
n <- 10000
df <- data.frame(
  a = rnorm(n),
  b = rnorm(n),
  c = rnorm(n),
  ip = replicate(n, paste(sample(255, 4), collapse='.'), simplify=TRUE)
)

# Each named expression below is timed over 10 replications; only the test
# name and elapsed time are reported, in the order the tests are listed.
res <- benchmark(columns=c('test', 'elapsed'), replications=10, order=NULL,
  # peda: parse the addresses with read.table() using "." as the separator,
  # then order() on the resulting columns.
  peda = {
    connection <- textConnection(as.character(df$ip))
    o <- do.call(order, read.table(connection, sep='.'))
    close(connection)
    df[o, ]
  },

  # peda2: same as peda, but read.table() is told the column classes up front
  # and has quoting, NA strings and blank-line skipping switched off
  # (presumably to cut parsing overhead -- the timings below barely differ).
  peda2 = {
    connection <- textConnection(as.character(df$ip))
    dfT <- read.table(connection, sep='.', colClasses=rep("integer",
4), quote="", na.strings=NULL, blank.lines.skip=FALSE)
    close(connection)
    o <- do.call(order, dfT)
    df[o, ]
  },

  # hb: split on ".", flatten to one integer vector and reshape it into a
  # 4 x nrow(df) matrix (one column per address, one row per octet). The four
  # octets are then combined into a single base-256 number per address and
  # the rows are ordered by that number.
  hb = {
    ip <- strsplit(as.character(df$ip), split=".", fixed=TRUE)
    ip <- unlist(ip, use.names=FALSE)
    ip <- as.integer(ip)
    dim(ip) <- c(4, nrow(df))
    ip <- 256^3*ip[1,] + 256^2*ip[2,] + 256*ip[3,] + ip[4,]
    o <- order(ip)
    df[o, ]
  },

  # hb2: same 4 x nrow(df) octet matrix as hb, but sorted one octet at a
  # time: first by the 4th octet, then the current ordering is re-sorted by
  # the 3rd, 2nd and 1st octet in turn.
  # NOTE(review): the result is only correct if sort.list(method="radix")
  # keeps ties in their existing order (stable sort) -- confirm in ?sort.list.
  hb2 = {
    ip <- strsplit(as.character(df$ip), split=".", fixed=TRUE)
    ip <- unlist(ip, use.names=FALSE)
    ip <- as.integer(ip);
    dim(ip) <- c(4, nrow(df))
    o <- sort.list(ip[4,], method="radix", na.last=TRUE)
    for (kk in 3:1) {
      # ip[kk, o] is octet kk in the current order; indexing o by the new
      # permutation composes the two orderings.
      o <- o[sort.list(ip[kk,o], method="radix", na.last=TRUE)]
    }
    df[o, ]
  }
)

print(res)

   test elapsed
1  peda    4.12
2 peda2    4.08
3    hb    0.28
4   hb2    0.25


On Sun, May 31, 2009 at 12:42 AM, Wacek Kusnierczyk
<Waclaw.Marcin.Kusnierczyk at idi.ntnu.no> wrote:
#
wow! :)

vQ
Henrik Bengtsson wrote:
#
Not really, just the old saying that any piece of code can be made
twice as fast (which often holds true recursively). /Henrik

On Sun, May 31, 2009 at 1:58 PM, Wacek Kusnierczyk
<Waclaw.Marcin.Kusnierczyk at idi.ntnu.no> wrote: