Skip to content

spss long labels

2 messages · Kurt Van Dijck, Martin Maechler

#
On Tue, Jul 15, 2008 at 09:29:22AM +0100, Prof Brian Ripley wrote:
I had problems with whitespace in the patch file,
so I have attached a new one.
I ran spss.R in tests/ and it worked fine. Be sure to clean all object
files before compiling.

Ilse made me a test .sav file (attached) with 2 variables (variable1 &
      variable2) and 3 records.
This piece of R code shows the problem:

# to resolve locale problem
Sys.setlocale (locale="C");
# read spss datafile
library(foreign);
data = read.spss("spss_long.sav", to.data.frame=TRUE);
# to.data.frame not necessary, but gives nicer output
# commands to show the data, the variable names and labels
data;
names(data);
attr(data, "variable.labels");

# result in unpatched version:
# both variable names are in shortened form 
# (max 8 characters; provided in SPSS-file)

#> data;
#  VARIABLE V2_A
#1        1    1
#2        2    1
#3        2    3
#
#> names(data);
#[1] "VARIABLE" "V2_A"    
#
#> attr(data,"variable.labels");
#   VARIABLE        V2_A 
#"variable1" "variable2" 

# and in patched version:
# variable names are the full names as originally defined in the SPSS-file

#> data;
#  variable1 variable2
#1         1         1
#2         2         1
#3         2         3
#> names(data);
#[1] "variable1" "variable2"
#> attr(data, "variable.labels");
#  variable1   variable2 
#"variable1" "variable2" 

Kind regards,
Kurt & Ilse

Index: src/sfm-read.c
===================================================================
--- src/sfm-read.c	(revision 5175)
+++ src/sfm-read.c	(working copy)
@@ -188,6 +188,8 @@
 static int read_variables (struct file_handle * h, struct variable *** var_by_index);
 static int read_machine_int32_info (struct file_handle * h, int size, int count, int *encoding);
 static int read_machine_flt64_info (struct file_handle * h, int size, int count);
+static int read_long_var_names (struct file_handle * h, struct dictionary *
+		, unsigned long size, unsigned int count);
 static int read_documents (struct file_handle * h);
 
 /* Displays the message X with corrupt_msg, then jumps to the lossage
@@ -418,11 +420,15 @@
 		break;
 
 	      case 7: /* Multiple-response sets (later versions of SPSS). */
-	      case 13:  /* long variable names. PSPP now has code for these
-			   that could be ported if someone is interested. */
 		skip = 1;
 		break;
 
+	      case 13:	/* long variable names. PSPP now has code for these
+			   that could be ported if someone is interested. */
+		if (!read_long_var_names(h, ext->dict, data.size, data.count))
+		  goto lossage;
+		break;
+
 	      case 16: /* See http://www.nabble.com/problem-loading-SPSS-15.0-save-files-t2726500.html */
 		skip = 1;
 		break;
@@ -584,14 +590,72 @@
   return 0;
 }
 
+/* Read record type 7, subtype 13.
+ * long variable names
+ */
 static int
+read_long_var_names (struct file_handle * h, struct dictionary * dict
+		, unsigned long size, unsigned int count)
+{
+  char * data;
+  unsigned int j;
+  struct variable ** lp;
+  struct variable ** end;
+  char * p;
+  char * endp;
+  char * val;
+  if ((1 != size)||(0 == count)) {
+    warning("%s: strange record info seen, size=%u, count=%u"
+      ", ignoring long variable names"
+      , h->fn, size, count);
+    return 0;
+  }
+  size *= count;
+  data = Calloc (size +1, char);
+  bufread(h, data, size, 0);
+  /* parse */
+  end = &dict->var[dict->nvar];
+  p = data;
+  do {
+    if (0 != (endp = strchr(p, '\t')))
+      *endp = 0; /* put null terminator */
+    if (0 == (val = strchr(p, '='))) {
+      warning("%s: no long variable name for variable '%s'", h->fn, p);
+    } else {
+      *val = 0;
+      ++val;
+      /* now, p is key, val is long name */
+      for (lp = dict->var; lp < end; ++lp) {
+        if (!strcmp(lp[0]->name, p)) {
+          strncpy(lp[0]->name, val, sizeof(lp[0]->name));
+          break;
+        }
+      }
+      if (lp >= end) {
+        warning("%s: long variable name mapping '%s' to '%s'"
+        "for variable which does not exist"
+        , h->fn, p, val);
+      }
+    }
+    p = &endp[1]; /* put to next */
+  } while (endp);
+
+  free(data);
+  return 1;
+
+lossage:
+  free(data);
+  return 0;
+}
+
+static int
 read_header (struct file_handle * h, struct sfm_read_info * inf)
 {
   struct sfm_fhuser_ext *ext = h->ext;	/* File extension strcut. */
   struct sysfile_header hdr;		/* Disk buffer. */
   struct dictionary *dict;		/* File dictionary. */
   char prod_name[sizeof hdr.prod_name + 1];	/* Buffer for product name. */
-  int skip_amt = 0;		        /* Amount of product name to omit. */
+  int skip_amt = 0;			/* Amount of product name to omit. */
   int i;
 
   /* Create the dictionary. */
@@ -1495,7 +1559,7 @@
 /* Reads one case from system file H into the value array PERM
    according to the instructions given in associated dictionary DICT,
    which must have the get.* elements appropriately set.  Returns
-   nonzero only if successful.  */
+   nonzero only if successful.	*/
 int
 sfm_read_case (struct file_handle * h, union value * perm, struct dictionary * dict)
 {
Index: src/var.h.in
===================================================================
--- src/var.h.in	(revision 5175)
+++ src/var.h.in	(working copy)
@@ -41,6 +41,10 @@
 #error MAX_SHORT_STRING must be less than 8.
 #endif
 
+/* VAR_NAME_LEN: the maximum length of a variable name.
+ * SPSS supports names of up to 64 characters.
+ */
+#define VAR_NAME_LEN 64
 /* Special values. */
 #define SYSMIS (-DBL_MAX)
 #define LOWEST second_lowest_double_val()
@@ -228,7 +232,7 @@
 /* MODIFY VARS private data. */
 struct modify_vars_proc
   {
-    char new_name[9];		/* Variable's new name. */
+    char new_name[VAR_NAME_LEN +1];		/* Variable's new name. */
     int drop_this_var;		/* 0=keep this var, 1=drop this var. */
     struct variable *next;	/* Next in linked list. */
   };
@@ -302,7 +306,7 @@
 struct variable
   {
     /* Required by parse_variables() to be in this order.  */
-    char name[9];		/* As a string. */
+    char name[VAR_NAME_LEN +1];	/* As a string. */
     int index;			/* Index into its dictionary's var[]. */
     int type;			/* NUMERIC or ALPHA. */
     int foo;			/* Used for temporary storage. */
@@ -373,9 +377,9 @@
 
     int weight_index;		/* `value' index of $WEIGHT, or -1 if none.
 				   Call update_weighting() before using! */
-    char weight_var[9];		/* Name of WEIGHT variable. */
+    char weight_var[VAR_NAME_LEN];/* Name of WEIGHT variable. */
 
-    char filter_var[9];		/* Name of FILTER variable. */
+    char filter_var[VAR_NAME_LEN];/* Name of FILTER variable. */
     /* Do not make another field the last field! or see
        temporary.c:restore_dictionary() before doing so! */
   };
#
Hi Kurt,

I have committed your patch and your new spss_long.sav
example file, tested things and uploaded a new version of
'foreign', namely 0.8-27 to CRAN.

I vaguely remember that other useRs have asked for long-label
support in the past, so if you are (or know of) such a user,
we'd be glad if you'd test the new foreign package once it is
available via update.packages().

Martin Maechler