Skip to content

Help with Data Transformation

8 messages · Guy Jett, John Kane, ONKELINX, Thierry +2 more

#
That sample data set is really hard to read.  Could you resent it after having used dput on it?  

A file output with dput is easily read into R and makes seeing what you need much easier.  BTW what are the = doing?

Thanks
--- On Mon, 1/10/11, Guy Jett <GJett at itsi.com> wrote:

            
#
Thank you John,
I have put a copy of the dput file below my signature block.  I hope that is what you need as I am unfamiliar with that function.  Note that empty cells need to become "NA".
The "=" character is part of the "PARVALUE" column.
Yours,
Guy
gjett at itsi.com

structure(list(X = c(2268L, 2269L, 2270L, 2272L, 2273L, 2274L, 
2275L, 2276L, 2289L, 2290L, 2291L, 2292L, 2293L, 2294L, 2295L, 
2296L, 2326L, 2327L, 2328L, 2329L, 2330L, 2331L, 2332L, 2346L, 
2349L, NA, NA, NA, NA, NA), fldsampid = structure(c(3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 2L, 3L, 4L), .Label = c("", "fldsampid", 
"LHR020GW-01E2", "LHR020SD-00E2"), class = "factor"), CLP_ID = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 3L), .Label = c("", 
"CLP_ID", "MY77J8"), class = "factor"), sacode = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 3L, 2L, 2L), .Label = c("", 
"N", "sacode"), class = "factor"), matrix = structure(c(4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L, 4L, 3L), .Label = c("", 
"matrix", "SE", "WG"), class = "factor"), etc. = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("", 
"etc."), class = "factor"), prccode = structure(c(4L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 8L, 5L, 6L, 7L, 
5L, 5L, 5L, 5L, 5L, 6L, 1L, 1L, 3L, 2L, NA), .Label = c("", "<value>", 
"CL", "INO", "MET", "MI", "ORG", "SN"), class = "factor"), Lab = structure(c(4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 5L, 2L, NA), .Label = c("", 
"<value>", "A4SW", "BRLS", "PO4"), class = "factor"), EXMCODE = structure(c(5L, 
5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L, 5L, 5L, 4L, 
4L, 5L, 4L, 4L, 4L, 4L, 7L, 4L, 1L, 1L, 6L, 2L, NA), .Label = c("", 
"<value>", "FLDFLT", "METHOD", "NONE", "SO4", "SW3050B"), class = "factor"), 
    Analysis = structure(c(13L, 13L, 13L, 10L, 11L, 11L, 11L, 
    11L, 11L, 11L, 11L, 3L, 3L, 3L, 5L, 13L, 9L, 8L, 15L, 7L, 
    12L, 12L, 12L, 14L, 4L, 1L, 1L, 6L, 2L, 2L), .Label = c("", 
    "<value>", "A2320", "A2540G", "A5310B", "AG", "C245.5", "E160.3", 
    "E1630", "E1631", "E1638", "E200.8", "E300", "SW6010B", "SW9060"
    ), class = "factor"), PARLABEL = structure(c(10L, 16L, 17L, 
    12L, 3L, 4L, 8L, 9L, 20L, 21L, 15L, 5L, 6L, 7L, 11L, 14L, 
    13L, 18L, 19L, 12L, 3L, 8L, 9L, 20L, 18L, 1L, 1L, 4L, 2L, 
    2L), .Label = c("", "<value>", "AG", "AL", "ALK", "ALKB", 
    "ALKC", "AS", "B", "CL", "DOC", "HG", "MEHG", "NO3", "PB", 
    "PO4", "SO4", "SOLID", "TOC", "V", "Zn"), class = "factor"), 
    PARVQ = structure(c(3L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    5L, 3L, 3L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 3L, 3L, 3L, 3L, 
    1L, 1L, 4L, 2L, 2L), .Label = c("", "<value>", "=", "AS", 
    "ND"), class = "factor"), Result = structure(c(12L, 20L, 
    11L, 3L, 10L, 8L, 14L, 26L, 9L, 5L, 4L, 25L, 25L, 13L, 19L, 
    24L, 15L, 18L, 16L, 21L, 6L, 22L, 7L, 23L, 17L, 1L, 1L, 27L, 
    2L, 2L), .Label = c("", "<value>", "0.00171", "0.008", "1.76", 
    "1050", "11400", "122", "131", "2.57", "22460", "23590.9", 
    "2500", "317", "4.28", "4.823", "47.7", "48.45", "49330", 
    "50", "5100", "5500", "56900", "792", "807000", "9970", "B"
    ), class = "factor"), X.1 = structure(c(1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L), .Label = c("", "<value>", 
    "V"), class = "factor"), X.2 = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "Zn"), class = "factor"), X.3 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L), .Label = c("", 
    "<value>", "etc."), class = "factor"), X.4 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "ALK"), class = "factor"), X.5 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "ALKB"), class = "factor"), X.6 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "ALKC"), class = "factor"), X.7 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 2L), .Label = c("", 
    "<value>", "SOLID"), class = "factor"), X.8 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "DOC"), class = "factor"), X.9 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "TOC"), class = "factor"), X.10 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, NA), .Label = c("", 
    "<value>", "NO3"), class = "factor")), .Names = c("X", "fldsampid", 
"CLP_ID", "sacode", "matrix", "etc.", "prccode", "Lab", "EXMCODE", 
"Analysis", "PARLABEL", "PARVQ", "Result", "X.1", "X.2", "X.3", 
"X.4", "X.5", "X.6", "X.7", "X.8", "X.9", "X.10"), class = "data.frame", row.names = c(NA, 
-30L))


-----Original Message-----
From: John Kane [mailto:jrkrideau at yahoo.ca] 
Sent: Monday, January 10, 2011 1:43 PM
To: r-help at r-project.org; Guy Jett
Subject: Re: [R] Help with Data Transformation

That sample data set is really hard to read.  Could you resent it after having used dput on it?  

A file output with dput is easily read into R and makes seeing what you need much easier.  BTW what are the = doing?

Thanks
--- On Mon, 1/10/11, Guy Jett <GJett at itsi.com> wrote:

            
#
That's fine.  Am I correct that this is the format you want for the output file? 
 
nams <- c("fldsampid", "CLP_ID", "sacode", "matrix", "etc.", "CL",
"PO4", "SO4AG", "AL", "AS", "B", "V", "Zn", "etc.", "ALK",
"ALKB", "ALKC", "SOLID", "DOC", "TOC", "NO3")


It seems a bit suspicious as you have etc. in there twice.


In any case if I understand what you want all you need to do is arrange those names in the order you want and put them in a vector call it bb.

Then you simply say

newxx <- xx[,bb]
et voil?I

You may want to have a look at indexing in the Introduction to R to get a feeling for what's happening herel

Below is a small example.

However I don't think your data.frame is quite what you think it is.

When I do str(xx) to look at the structure all your variables are being read in as factors, which I suspect is not what you want.  R tries to recognize what type of variable is being read in and often seems to decide a character or even a numeric variable is a factor

You may want to run the command 
options(stringsAsFactors = FALSE)
before you load the data into the data.frame

I hope this is of some help.
#===================================================================
df1 <- structure(list(site = c(1, 1, 4, 4, 1, 4), id = structure(c(1L,
2L, 2L, 3L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"),
    cata = c(1, 1, 6, 1, 1, NA), catb = c(1, 2, 3, 4, 5, 6),
    doga = c(3, 5, 3, 6, 4, 0), dogb = c(2, 4, 6, 8, 10, 12),
    rata = c(NA, 9, 9, 8, 9, 8), ratb = c(1, 2, 3, 4, 5, 6),
    bata = c(12, 42, NA, 45, 32, 54), batb = c(13, 15, 17, 19,
    21, 23)), .Names = c("site", "id", "cata", "catb", "doga",
"dogb", "rata", "ratb", "bata", "batb"), row.names = c("aa",
"bb", "cc", "dd", "ee", "ff"), class = "data.frame")

df1

bb <- c("dogb", "rata", "ratb", "bata", "batb", "site", "id", "cata",
    "catb", "doga")
    
    newdf <- df1[,bb]
    
 
#==================================================================
--- On Mon, 1/10/11, Guy Jett <GJett at itsi.com> wrote:

            
#
Dear Guy,

Have a look at cast() from the reshape package. You'll need something
like

cast(fldsampleid ~ Analysis, value = "Result", data = your.data.frame)

Best regards,

Thierry

------------------------------------------------------------------------
----
ir. Thierry Onkelinx
Instituut voor natuur- en bosonderzoek
team Biometrie & Kwaliteitszorg
Gaverstraat 4
9500 Geraardsbergen
Belgium

Research Institute for Nature and Forest
team Biometrics & Quality Assurance
Gaverstraat 4
9500 Geraardsbergen
Belgium

tel. + 32 54/436 185
Thierry.Onkelinx at inbo.be
www.inbo.be

To call in the statistician after the experiment is done may be no more
than asking him to perform a post-mortem examination: he may be able to
say what the experiment died of.
~ Sir Ronald Aylmer Fisher

The plural of anecdote is not data.
~ Roger Brinner

The combination of some data and an aching desire for an answer does not
ensure that a reasonable answer can be extracted from a given body of
data.
~ John Tukey
#
You might also want to look at the SQL packages for R so you can skip
this manual step. I'd recommend starting with
http://cran.r-project.org/doc/manuals/R-data.html#Relational-databases

Hadley
1 day later
#
Hi John,
Thank you for your patience.  I was away for a State certification exam yesterday, so am just getting back to this.  

Reading through you response I believe I wasn't clear enough about what I'm trying to do.  Your description seems to rearrange the matrix without grouping the analytical results for a single sample onto a single line, as I had hoped.  I may have confused things by attempting to send a truncated/simplified dataset.  

Restatement of needs:
* I have 863 individual samples.  The following columns contain invariant results for each sample:
    - "Transect","Offset","Location","fldsampid","CLP_ID","sacode","matrix","LTCCODE",
	"Northing","Easting","CRDUNITS","Event","LOGDATE","sbd","sed".
    - Sorting can make use of "fldsampid" as these values are entirely unique to 
      each sample.
* Each sample is associates with one or more of the following 48 analytical parameters:
    - AG","AL","ALK","ALKB","ALKC","AS","B","BA","BE","BR","CA","CD""CL","CO","CR","CU",
      "DOC","FE","Hg","HG","HGACIDLAB","HGEXTINO","HGEXTORG","HGNONMOB""HGSEMIMOB","K",
      "MEHG","MG","MN","MO","NH3N","NI","NO2N","NO3","NO3N","PB""PO4","S","SB","SE","SO4",
	"SOLID","SSC","TL","TOC","V","Zn","ZN"
    - These are currently stored in the "PARLABEL" column.
* For each sample ID I would like to create a single line;
  - Extract each "PARLABEL" to use as a column name; and
  - Place the "Result" in the appropriate column.
* I can subset the data so that "prccode", "Lab", "EXMCODE", "Analysis", "PARVQ", "RL",
  "EPA_FLAGS", and "units" are irrelevant to the issue.

The following snippet should illustrate the absolute minumum needs:
INPUT
fldsampid   |  PARLABEL    |  Result
------------+--------------+---------
fldsampid1  |  PARLABEL-a  |  value-8
fldsampid1  |  PARLABEL-b  |  value-5
fldsampid1  |  PARLABEL-x  |  value-2
fldsampid1  |  PARLABEL-y  |  value-0
fldsampid2  |  PARLABEL-a  |  value-9
fldsampid2  |  PARLABEL-c  |  value-8
fldsampid3  |  PARLABEL-a  |  value-2
fldsampid3  |  PARLABEL-d  |  value-8
fldsampid3  |  PARLABEL-w  |  value-3
fldsampid3  |  PARLABEL-x  |  value-9
fldsampid3  |  PARLABEL-y  |  value-6

OUTPUT
fldsampid   |  PARLABEL-a  |  PARLABEL-b  |  PARLABEL-w  |  PARLABEL-x  |  PARLABEL-y  
------------+--------------+--------------+--------------+--------------+--------------
fldsampid1  |   value-8    |   value-5    |   NA         |   value-2    |   value-0    
fldsampid2  |   value-9    |   value-2    |   NA         |   NA         |   NA         
fldsampid3  |   value-2    |   NA         |   value-3    |   value-9    |  value-6     

If it would help I could attach a 31kb file written with 
	write.table(Units_NG.L, file="Units_NG.L", quote=FALSE, sep="\t")
This subset has 97 individual samples and 3 "PARLABELS" distributed across 249 individual lines.


Added Responses:
1. The structure of my actual input file appears to be correct per the following:
   (I has sent you a separate extration from an excel file)
   (Strings as Factors, numbers as num or int; a date changed via as.Date())
'data.frame':   19694 obs. of  25 variables:
 $ Transect : Factor w/ 78 levels "FLR01","FLR02",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Offset   : Factor w/ 16 levels "0","A","B","C",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Location : Factor w/ 246 levels "FLR010","FLR01A",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ fldsampid: Factor w/ 863 levels "FLR010-ANE1",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ CLP_ID   : Factor w/ 586 levels "","MY6591","MY6593",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ sacode   : Factor w/ 2 levels "FD","N": 2 2 2 2 2 2 2 2 2 2 ...
 $ matrix   : Factor w/ 6 levels "SE","SO","TA",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ LTCCODE  : Factor w/ 4 levels "BH","LK","RE",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ Northing : num  2444733 2444733 2444733 2444733 2444733 ...
 $ Easting  : num  5684613 5684613 5684613 5684613 5684613 ...
 $ CRDUNITS : Factor w/ 1 level "FT": 1 1 1 1 1 1 1 1 1 1 ...
 $ Event    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ LOGDATE  :Class 'Date'  num [1:19694] -717743 -717743 -717743 -717743 -717743 ...
 $ sbd      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ sed      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ prccode  : Factor w/ 5 levels "INO","MET","MI",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ Lab      : Factor w/ 5 levels "A4SW","BRLS",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ EXMCODE  : Factor w/ 5 levels "FLDFLT","METHOD",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ Analysis : Factor w/ 23 levels "A2320","A2540G",..: 10 11 12 12 12 12 12 12 12 12 ...
 $ PARLABEL : Factor w/ 48 levels "AG","AL","ALK",..: 27 20 1 2 6 7 8 9 12 14 ...
 $ PARVQ    : Factor w/ 3 levels "=","ND","TR": 1 1 2 1 1 1 1 2 1 1 ...
 $ Result   : num  20.6 24.7 5 14900 60 100 4930 4 182 80 ...
 $ RL       : num  3.1 0.77 10 5750 160 790 160 8 10 80 ...
 $ EPA_FLAGS: Factor w/ 10 levels "","J","J-","J+",..: 4 1 7 3 2 2 1 7 1 2 ...
 $ units    : Factor w/ 3 levels "ug/kg","ug/L",..: 1 1 1 1 1 1 1 1 1 1 ...

2. "etc..." Sorry to confuse you, this was to indicate additional columns.

Guy Jett
ITSI,? A Gilbane Company
(925) 946-3340 Direct
(925) 457-4168 ITSI Cell
gjett at itsi.com

-----Original Message-----
From: John Kane [mailto:jrkrideau at yahoo.ca] 
Sent: Monday, January 10, 2011 4:29 PM
To: r-help at r-project.org; Guy Jett
Subject: RE: [R] Help with Data Transformation

That's fine.  Am I correct that this is the format you want for the output file? 
 
nams <- c("fldsampid", "CLP_ID", "sacode", "matrix", "etc.", "CL", "PO4", "SO4AG", "AL", "AS", "B", "V", "Zn", "etc.", "ALK", "ALKB", "ALKC", "SOLID", "DOC", "TOC", "NO3")


It seems a bit suspicious as you have etc. in there twice.


In any case if I understand what you want all you need to do is arrange those names in the order you want and put them in a vector call it bb.

Then you simply say

newxx <- xx[,bb]
et voil?I

You may want to have a look at indexing in the Introduction to R to get a feeling for what's happening herel

Below is a small example.

However I don't think your data.frame is quite what you think it is.

When I do str(xx) to look at the structure all your variables are being read in as factors, which I suspect is not what you want.  R tries to recognize what type of variable is being read in and often seems to decide a character or even a numeric variable is a factor

You may want to run the command
options(stringsAsFactors = FALSE)
before you load the data into the data.frame

I hope this is of some help.
#===================================================================
df1 <- structure(list(site = c(1, 1, 4, 4, 1, 4), id = structure(c(1L, 2L, 2L, 3L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"),
    cata = c(1, 1, 6, 1, 1, NA), catb = c(1, 2, 3, 4, 5, 6),
    doga = c(3, 5, 3, 6, 4, 0), dogb = c(2, 4, 6, 8, 10, 12),
    rata = c(NA, 9, 9, 8, 9, 8), ratb = c(1, 2, 3, 4, 5, 6),
    bata = c(12, 42, NA, 45, 32, 54), batb = c(13, 15, 17, 19,
    21, 23)), .Names = c("site", "id", "cata", "catb", "doga", "dogb", "rata", "ratb", "bata", "batb"), row.names = c("aa", "bb", "cc", "dd", "ee", "ff"), class = "data.frame")

df1

bb <- c("dogb", "rata", "ratb", "bata", "batb", "site", "id", "cata",
    "catb", "doga")
    
    newdf <- df1[,bb]
    
 
#==================================================================
--- On Mon, 1/10/11, Guy Jett <GJett at itsi.com> wrote: