all.equal() improvements (PR#8191) - R-devel

Sun, Oct 9, 2005 10:08 AM #

--k1lZvvs/B4yU6o8G
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

The attached patch against R 2.2.0 makes the following improvements to
the all.equal() function:

1. Check names!  Stock R all.equal() (unlike S-Plus) ignores names
   completely on some objects.  I consider this wrong - if the names
   are different, the object is NOT "the same".

2. When a difference is detected, return better output to help the
   user understand just WHAT is different.

Further details are included in the code comments, but in particular,
all.equal.list() is much enhanced.  By default it still checks list
values by postion rather than name, as that behavior is more strict
and thus more correct.

But when using the by.name="auto" and by.pos=TRUE options (which are
the defaults), in addition to by-positing differences,
all.equal.list() now also reports by-name differences in those places
(and only those places) where doing so should be helpful to the user.

Also, optionally, using by.name=TRUE and by.pos=FALSE will give
behavior like S-Plus.

The attached patch is also available here:

  http://www.piskorski.com/R/patches/all-equal-patch-20051009.txt

If you want to see the entire file rather than a patch against R
2.2.0, that is also available here:

  http://www.piskorski.com/R/patches/all.equal.R

Andrew Piskorski <atp at piskorski.com>
http://www.piskorski.com/

--k1lZvvs/B4yU6o8G
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="all-equal-patch-20051009.txt"

Index: all.equal.R
===================================================================
RCS file: /home/cvsroot/dtk/Splus/patches/all.equal.R,v
retrieving revision 1.1.1.1
retrieving revision 1.4
diff -u -r1.1.1.1 -r1.4
--- all.equal.R	1 Oct 2005 06:51:06 -0000	1.1.1.1
+++ all.equal.R	1 Oct 2005 13:10:25 -0000	1.4
@@ -1,4 +1,75 @@
-all.equal <- function(target, current, ...) UseMethod("all.equal")
+#
+# This is a copy of "src/library/base/R/all.equal.R" from
+# "R-beta_2005-09-24_r35666.tar.gz", plus our modifications.  (The
+# all.equal.R in that tarball seems to be unchanged at least as far
+# back as R 2.1.0.)
+#
+# Further detail is in the comments in each function, but basically,
+# all modifications here involve either of two sorts of improvements:
+#
+# 1. Check names!  Stock R all.equal() (unlike S-Plus) ignores names
+#    completely on some objects.  I consider this bogus, if the names
+#    are different, the object is NOT "the same".
+#
+# 2. When the object is different, return more output to help the user
+#    understand just WHAT is different.
+#
+# Note: Here in our patches package, we purposely CVS import and than
+# override ALL the base all.equal() methods, NOT just the ones we're
+# actually modifying.  At first I tried only overriding some of them,
+# but in that case, even though package:patches was earlier on the
+# search path than package:base, base methods appeared to
+# preferentially call the original base versions, rather than the
+# patches versions that I wanted.  So, big hammer it, override
+# everything - which will probably make it easier to contribute these
+# improvements back to stock R anyway.
+#
+# --atp at piskorski.com, 2005/10/01 02:29 EDT
+#
+# $Id: all.equal.R,v 1.4 2005/10/01 13:10:25 andy Exp $
+
+
+# In S-Plus, all.equal() prefers to index objects by name, while in
+# stock R, it prefers to index by position.  IMO, *NEITHER* of those
+# behaviors are fully correct.  What we really want is to compare
+# things BOTH by name and by position.
+#
+# Here's ONE example of the effect of these R patches:
+#
+## S-Plus 6.2, no patches to all.equal():
+#> all.equal(list(a=2,2,x=3,zap=1,foo=42,"NA"=T) ,list(b=1,2,y=4,foo=7,zap=1,"NA"=F))
+#[1] "Names: 4 string mismatches"                        
+#[2] "Components not in target: b, y"                    
+#[3] "Components not in current: a, x"                   
+#[4] "Component foo: Mean relative difference: 0.8333333"
+#[5] "Component NA: Mean relative difference: 1"         
+#
+## R 2.1.0, no patches to all.equal():
+#> all.equal(list(a=2,2,x=3,zap=1,foo=42,"NA"=T) ,list(b=1,2,y=4,foo=7,zap=1,"NA"=F))
+#[1] "Names: 4 string mismatches"                       
+#[2] "Component 1: Mean relative  difference: 0.5"      
+#[3] "Component 3: Mean relative  difference: 0.3333333"
+#[4] "Component 4: Mean relative  difference: 6"        
+#[5] "Component 5: Mean relative  difference: 0.9761905"
+#[6] "Component 6: Mean relative  difference: 1"        
+#
+## R 2.1.0 with our patches here:
+#> all.equal(list(a=2,2,x=3,zap=1,foo=42,"NA"=T) ,list(b=1,2,y=4,foo=7,zap=1,"NA"=F))
+# [1] "Names: 4 string mismatches"                         
+# [2] "Components not in target: b, y"                     
+# [3] "Components not in current: a, x"                    
+# [4] "Component foo: Mean relative  difference: 0.8333333"
+# [5] "Component NA: Mean relative  difference: 1"         
+# [6] "Component 1: Mean relative  difference: 0.5"        
+# [7] "Component 3: Mean relative  difference: 0.3333333"  
+# [8] "Component 4: Mean relative  difference: 6"          
+# [9] "Component 5: Mean relative  difference: 0.9761905"  
+#[10] "Component 6: Mean relative  difference: 1"          
+
+
+#all.equal.original.fcn <- get("all.equal" ,pos="package:base")
+all.equal <- function(target, current, ... ,debug.p=FALSE) UseMethod("all.equal")
+
 
 ## NO:  is.*(x) should be like S4  is(x, *) ! -- use  isTRUE(all.equal(*))
 ## is.all.equal <- function(target, current, ...)
@@ -32,23 +103,32 @@
 function(target, current, tolerance = .Machine$double.eps ^ .5,
          scale=NULL, ...)
 {
-    if(data.class(target) != data.class(current))
-        return(paste("target is ", data.class(target), ", current is ",
-		       data.class(current), sep = ""))
+    msg <- attr.all.equal(target, current ,...)
+    if(data.class(target) != data.class(current)) {
+        msg <- c(msg ,paste("target is ", data.class(target), ", current is ",
+                            data.class(current), sep = ""))
+        return(msg)
+    }
+
     lt <- length(target)
     lc <- length(current)
     cplx <- is.complex(target)
-    if(lt != lc)
-	return(paste(if(cplx)"Complex" else "Numeric",
-                     ": lengths (", lt, ", ", lc, ") differ", sep = ""))
+    if(lt != lc) {
+        msg <- c(msg ,paste(if(cplx)"Complex" else "Numeric",
+                            ": lengths (", lt, ", ", lc, ") differ", sep = ""))
+	return(msg)
+    }
     target <- as.vector(target)
     current <- as.vector(current)
     out <- is.na(target)
-    if(any(out != is.na(current)))
-	return(paste("`is.NA' value mismatches:", sum(is.na(current)),
-		     "in current,", sum(out), " in target"))
+    if(any(out != is.na(current))) {
+        msg <- c(msg ,paste("`is.NA' value mismatches:", sum(is.na(current)),
+                            "in current,", sum(out), " in target"))
+	return(msg)
+    }
     out <- out | target == current
-    if(all(out)) return(TRUE)
+    if(all(out)) { if (is.null(msg)) return(TRUE) else return(msg) }
+
     target <- target[!out]
     current <- current[!out]
     xy <- mean((if(cplx)Mod else abs)(target - current))
@@ -63,29 +143,37 @@
 	    xy <- xy/scale
 	    "scaled"
 	}
+
     if(is.na(xy) || xy > tolerance)
-	paste("Mean", what, if(cplx)"Mod", "difference:", format(xy)) else TRUE
+       msg <- c(msg ,paste("Mean", what, if(cplx)"Mod", "difference:", format(xy)))
+
+    if(is.null(msg)) TRUE else msg
 }
 
 all.equal.character <- function(target, current, ...)
 {
-    if(data.class(target) != data.class(current))
-        return(paste("target is ", data.class(target), ", current is ",
-		       data.class(current), sep = ""))
+    msg <- attr.all.equal(target, current ,...)
+    if(data.class(target) != data.class(current)) {
+        msg <- c(msg ,paste("target is ", data.class(target), ", current is ",
+                            data.class(current), sep = ""))
+        return(msg)
+    }
     lt <- length(target)
     lc <- length(current)
     if(lt != lc) {
-	msg <- paste("Lengths (", lt, ", ", lc,
+	msg <- c(msg ,paste("Lengths (", lt, ", ", lc,
 		     ") differ (string compare on first ", ll <- min(lt, lc),
-		     ")", sep = "")
+		     ")", sep = ""))
 	ll <- seq(length = ll)
 	target <- target[ll]
 	current <- current[ll]
-    } else msg <- NULL
+    }
     nas <- is.na(target)
-    if (any(nas != is.na(current)))
-        return(paste("`is.NA' value mismatches:", sum(is.na(current)),
+    if (any(nas != is.na(current))) {
+        msg <- c(msg ,paste("`is.NA' value mismatches:", sum(is.na(current)),
                      "in current,", sum(nas), " in target"))
+        return(msg)
+    }
     ne <- !nas & (target != current)
     if(!any(ne) && is.null(msg)) TRUE
     else if(any(ne)) c(msg, paste(sum(ne), "string mismatches"))
@@ -141,76 +229,205 @@
     if(is.null(msg)) TRUE else msg
 }
 
-all.equal.list <- function(target, current, ...)
-{
-    msg <- attr.all.equal(target, current, ...)
-#    nt <- names(target)
-    nc <- names(current)
-    iseq <-
-        ## <FIXME>
-        ## Commenting this eliminates PR#674, and assumes that lists are
-        ## regarded as generic vectors, so that they are equal iff they
-        ## have identical names attributes and all components are equal.
-        ## if(length(nt) && length(nc)) {
-        ##     if(any(not.in <- (c.in.t <- match(nc, nt, 0)) == 0))
-        ## 	msg <- c(msg, paste("Components not in target:",
-        ## 			    paste(nc[not.in], collapse = ", ")))
-        ##     if(any(not.in <- match(nt, nc, 0) == 0))
-        ## 	msg <- c(msg, paste("Components not in current:",
-        ## 			    paste(nt[not.in], collapse = ", ")))
-        ##     nt[c.in.t]
-        ## } else
-        ## </FIXME>
-        if(length(target) == length(current)) {
-	    seq(along = target)
-	} else {
-	    nc <- min(length(target), length(current))
-	    msg <- c(msg, paste("Length mismatch: comparison on first",
-				nc, "components"))
-	    seq(length = nc)
-	}
-    for(i in iseq) {
-	mi <- all.equal(target[[i]], current[[i]], ...)
-	if(is.character(mi))
-	    msg <- c(msg, paste("Component ", i, ": ", mi, sep=""))
-    }
-    if(is.null(msg)) TRUE else msg
+all.equal.list <- function
+(target, current, ..., by.name="auto", by.pos=TRUE, debug.p=FALSE) {
+   # This is copied from "R-beta_2005-09-24_r35666.tar.gz" (which
+   # seems to be unchangd since at least R 2.1.0) and then
+   # modified.
+   #
+   # The stock implementation checked names (and other attributes)
+   # on the list itself but NOT on the components of the list!  We
+   # fix that below.  Furthermore, when reporting differences, it
+   # is much more helpful to report the name-indexed as well as
+   # position-indexed differences, so do that too.
+   #
+   # See also:
+   #   http://r-bugs.biostat.ku.dk/cgi-bin/R/Language-fixed?id=674
+   #   https://stat.ethz.ch/pipermail/r-devel/2000-October/thread.html#21323
+   #
+   # --atp at piskorski.com, 2005/09/27 20:32 EDT
+
+   # - by.name:  When set to "auto" (and by.pos=T), we display the
+   #   by.name=T messages if and only if the by.pos=T checks found
+   #   that one or more names differ.  This is particularly useful for
+   #   data frames, as it immediately disambiguates the "columns are
+   #   just in different orders" vs. "columns of the same name really
+   #   do have different values" cases.
+
+   msg <- attr.all.equal(target, current, ... ,debug.p=debug.p)
+
+   recurse <- function(by.which ,show.other.p) {
+      msg <- c()
+      if (by.which == "name") {
+         by.name <- TRUE ; by.pos <- FALSE
+      } else {
+         by.pos <- TRUE
+         if (by.name != "auto") by.name <- FALSE
+      }
+
+      for (i in iseq) {
+         other.str <- ""
+         mi <- all.equal(target[[i]], current[[i]], ...
+                         ,by.name=by.name ,by.pos=by.pos ,debug.p=debug.p)
+         if (is.character(mi)) {
+            names.differ.p <- F
+            if (by.which == "pos") {
+               name.c <- nc[[i]] ; name.t <- nt[[i]]
+               tmp <- (name.c == name.t)
+               if (length(name.c)==0 && length(name.t)==0) {
+                  # No names at all, show nothing.
+               } else if (length(tmp) && !is.na(tmp) && tmp) {
+                  # Names are the same, only show one:
+                  other.str <- paste(" (" ,name.c ,")" ,sep="")
+               } else {
+                  # Current and Target names differ, show both:
+                  other.str <- paste(" (" ,name.c ," / " ,name.t ,")" ,sep="")
+                  names.differ.p <- T
+               }
+            }
+            msg <- c(msg, paste("Component ", i, other.str, ": ", mi, sep=""))
+
+            if (by.which == "pos" && names.differ.p && by.name == "auto") {
+               tmp <- which(name.c == names(target))
+               if (length(tmp)==0) {
+                  ## This is redundant with the "names not in Target:"
+                  ## message we already printed out:
+                  #msg <- c(msg, paste("Component ", name.c, ": ", "Not in Target.", sep=""))
+               } else {
+                  if (length(tmp) > 1) {
+                     msg <- c(msg, paste("Warning:" ,length(tmp) ,"components in Target w/ name:" ,name.c))
+                     # The code below will check only the first named component:
+                  }
+                  mi <- all.equal(target[[name.c]] ,current[[name.c]] ,...
+                                  ,by.name=TRUE ,by.pos=FALSE ,debug.p=debug.p)
+                  if (is.character(mi))
+                     msg <- c(msg, paste("Component ", name.c, ": ", mi, sep=""))
+                  else
+                     msg <- c(msg, paste("Component ", name.c, ": ", "[same]", sep=""))
+               }
+            }
+         }
+      }
+      return(msg)
+   }
+
+   if (by.name==FALSE && !by.pos)
+      stop("Cannot have both by.pos and by.name False!")
+   if (!is.logical(by.name) && by.name != "auto")
+      stop(paste("Invalid value for by.name:" ,by.name))
+   nt <- names(target) ; nc <- names(current)
+
+   if (by.name == TRUE && length(nt) > 0 && length(nc) > 0) {
+      ## These "Components not in" messages are redundant with the
+      ## "names not in" messages we've already printed out:
+      ## --atp at piskorski.com, 2005/10/01 08:05 EDT
+      if(any(not.in <- (c.in.t <- match(nc, nt, 0)) == 0)) {
+         #msg <- c(msg, paste("Components not in target:", paste(nc[not.in], collapse = ", ")))
+      }
+      #if(any(not.in <- match(nt, nc, 0) == 0)) {
+      #   msg <- c(msg, paste("Components not in current:", paste(nt[not.in], collapse = ", ")))
+      #}
+      iseq <- nt[c.in.t]
+      msg <- c(msg ,recurse(by.which="name"))
+   }
+
+   if (by.pos) {
+      iseq <-
+         (if(length(target) == length(current)) {
+            seq(along = target)
+         } else {
+            tmp <- min(length(target), length(current))
+            msg <- c(msg, paste("Length mismatch: comparison on first",
+                                tmp, "components"))
+            seq(length = tmp)
+         })
+      msg <- c(msg ,recurse(by.which="pos"))
+   }
+
+   if(is.null(msg)) TRUE else msg
 }
 
+
 
-attr.all.equal <- function(target, current, ...)
-{
-    ##--- "all.equal(.)" for attributes ---
-    ##---  Auxiliary in all.equal(.) methods --- return NULL or character()
-    msg <- NULL
-    if(mode(target) != mode(current))
-	msg <- paste("Modes: ", mode(target), ", ", mode(current), sep = "")
-    if(length(target) != length(current))
-	msg <- c(msg, paste("Lengths: ", length(target), ", ",
-			    length(current), sep = ""))
-    ax <- attributes(target)
-    ay <- attributes(current)
-    nx <- names(target)
-    ny <- names(current)
-    if((lx <- length(nx)) | (ly <- length(ny))) {
-	## names() treated now; hence NOT with attributes()
-	ax$names <- ay$names <- NULL
-	if(lx && ly) {
-	    if(is.character(m <- all.equal.character(nx, ny)))
-		msg <- c(msg, paste("Names:", m))
-	} else if(lx)
-	    msg <- c(msg, "names for target but not for current")
-	else msg <- c(msg, "names for current but not for target")
-    }
-    if(length(ax) || length(ay)) {# some (more) attributes
-	## order by names before comparison:
-	nx <- names(ax)
-	ny <- names(ay)
-	if(length(nx))	    ax <- ax[order(nx)]
-	if(length(ny))	    ay <- ay[order(ny)]
-	tt <- all.equal(ax, ay, ...)
-	if(is.character(tt)) msg <- c(msg, paste("Attributes: <", tt, ">"))
-    }
-    msg # NULL or character
+attr.all.equal <- function(target, current, ... ,debug.p=FALSE) {
+   # Based on stock "R-beta_2005-09-24_r35666.tar.gz".  Differences are:
+   # - Also report WHICH names differ.
+   # - Do same checks on row.names and dimnames (if present) as on
+   #   names.
+   # --atp at piskorski.com, 2005/10/01 01:16 EDT
+
+   ##--- "all.equal(.)" for attributes ---
+   ##---  Auxiliary in all.equal(.) methods --- return NULL or character()
+   msg <- NULL
+   if(mode(target) != mode(current))
+      msg <- paste("Modes: ", mode(target), ", ", mode(current), sep = "")
+   if(length(target) != length(current))
+      msg <- c(msg, paste("Lengths: ", length(target), ", ",
+                          length(current), sep = ""))
+   ax <- attributes(target)
+   ay <- attributes(current)
+
+   local.compare.names <- function() {
+      msg <- c()
+      if(lx && ly) {
+         if(is.character(m <- all.equal.character(nx, ny ,debug.p=debug.p))) {
+            msg <- c(msg, paste("Names:", m))
+            not.in <- setdiff(ny ,nx)
+            if (length(not.in) > 0)
+               msg <- c(msg, paste(length(not.in) ,name.type.pretty
+                                   ,"not in Target:"
+                                   ,paste(not.in,collapse=", ")))
+            not.in <- setdiff(nx ,ny)
+            if (length(not.in) > 0)
+               msg <- c(msg, paste(length(not.in) ,name.type.pretty
+                                   ,"not in Current:"
+                                   ,paste(not.in,collapse=", ")))
+         }
+      } else if(lx) {
+         msg <- c(msg ,name.type.pretty ,"for Target but not for Current")
+      } else { msg <- c(msg ,name.type.pretty ,"for Current but not for Target") }
+      return(msg)
+   }
+
+   nx <- names(target) ; ny <- names(current)
+   if((lx <- length(nx)) | (ly <- length(ny))) {
+      ## names() treated now; hence NOT with attributes()
+      ax$names <- ay$names <- NULL
+      name.type.pretty <- "names"
+      msg <- c(msg ,local.compare.names())
+   }
+   if (any(names(ax) == "row.names") && any(names(ay) == "row.names")) {
+      nx <- row.names(target) ; ny <- row.names(current)
+      if((lx <- length(nx)) | (ly <- length(ny))) {
+         ## row.names() treated now; hence NOT with attributes():
+         ax$row.names <- ay$row.names <- NULL
+         name.type.pretty <- "row.names"
+         msg <- c(msg ,local.compare.names())
+      }
+   }
+   if (any(names(ax) == "dimnames") && any(names(ay) == "dimnames")) {
+      # We destructively remove dimnames, so loop from highest to lowest:
+      for (dim.i in length(dimnames(target)):1) {
+         nx <- dimnames(target)[[dim.i]] ; ny <- dimnames(current)[[dim.i]]
+         if((lx <- length(nx)) | (ly <- length(ny))) {
+            ## dimnames()[[dim.i]] treated now; hence NOT with attributes():
+            ax$dimnames[[dim.i]] <- ay$dimnames[[dim.i]] <- NULL
+            name.type.pretty <- paste("dimnames[[" ,dim.i ,"]]" ,sep="")
+            msg <- c(msg ,local.compare.names())
+         }
+      }
+   }
+
+   if(length(ax) || length(ay)) {# some (more) attributes
+      ## order by names before comparison:
+      nx <- names(ax)
+      ny <- names(ay)
+      if(length(nx))	    ax <- ax[order(nx)]
+      if(length(ny))	    ay <- ay[order(ny)]
+      tt <- all.equal(ax, ay, ... ,debug.p=debug.p)
+      if(is.character(tt)) msg <- c(msg, paste("Attributes: <", tt, ">"))
+   }
+
+   msg # NULL or character
 }
 

--k1lZvvs/B4yU6o8G--