Skip to content

Change values in a dateframe

10 messages · Arnaud Michel, Berend Hasselman, arun

#
Hello

I have the following problem :
The dataframe TEST has multiple lines for a same person because :
there are differents values of Nom or differents values of Prenom
but the values of Matricule or Sexe or Date.de.naissance are the same.

TEST <- structure(list(Matricule = c(66L, 67L, 67L, 68L, 89L, 90L, 90L,
91L, 108L, 108L, 108L), Nom = structure(c(1L, 2L, 2L, 4L, 8L,
5L, 6L, 9L, 3L, 3L, 7L), .Label = c("CHICHE", "GEOF", "GUTIER",
"JACQUE", "LANGUE", "LANGUE-LOPEZ", "RIVIER", "TRU", "VINCENT"
), class = "factor"), Prenom = structure(c(8L, 3L, 4L, 5L, 1L,
2L, 2L, 9L, 6L, 7L, 7L), .Label = c("Edgar", "Elodie", "Jeanine",
"Jeannine", "Michel", "Michele", "Mich?le", "Michelle", "Victor"
), class = "factor"), Sexe = structure(c(1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 1L, 1L), .Label = c("F?minin", "Masculin"), class = 
"factor"),
     Date.de.naissance = structure(c(4L, 2L, 2L, 7L, 6L, 5L, 5L,
     1L, 3L, 3L, 3L), .Label = c("03/09/1940", "04/03/1946", "07/12/1947",
     "18/11/1945", "27/09/1947", "29/12/1936", "30/03/1935"), class = 
"factor")), .Names = c("Matricule",
"Nom", "Prenom", "Sexe", "Date.de.naissance"), class = "data.frame", 
row.names = c(NA,
-11L))


I would want to make homogeneous the information and would like built 2 
dataframes :
df1 wich has the value of Nom and Prenom of the first lines of TEST when 
there are different values. The other values (Matricule or Sexe or 
Date.de.naissance) are unchanged

df1 <- structure(list(Matricule = c(66L, 67L, 67L, 68L, 89L, 90L, 90L,
91L, 108L, 108L, 108L), Nom = structure(c(1L, 2L, 2L, 4L, 6L,
5L, 5L, 7L, 3L, 3L, 3L), .Label = c("CHICHE", "GEOF", "GUTIER",
"JACQUE", "LANGUE", "TRU", "VINCENT"), class = "factor"), Prenom = 
structure(c(6L,
3L, 3L, 4L, 1L, 2L, 2L, 7L, 5L, 5L, 5L), .Label = c("Edgar",
"Elodie", "Jeanine", "Michel", "Michele", "Michelle", "Victor"
), class = "factor"), Sexe = structure(c(1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 1L, 1L), .Label = c("F?minin", "Masculin"), class = 
"factor"),
     Date.de.naissance = structure(c(4L, 2L, 2L, 7L, 6L, 5L, 5L,
     1L, 3L, 3L, 3L), .Label = c("03/09/1940", "04/03/1946", "07/12/1947",
     "18/11/1945", "27/09/1947", "29/12/1936", "30/03/1935"), class = 
"factor")), .Names = c("Matricule",
"Nom", "Prenom", "Sexe", "Date.de.naissance"), class = "data.frame", 
row.names = c(NA,
-11L))

df2 wich has the value of Nom and Prenom of the last lines of TEST when 
there are different values. The other values (Matricule or Sexe or 
Date.de.naissance) are unchanged.

df2 <- structure(list(Matricule = c(66L, 67L, 67L, 68L, 89L, 90L, 90L,
91L, 108L, 108L, 108L), Nom = structure(c(1L, 2L, 2L, 3L, 6L,
4L, 4L, 7L, 5L, 5L, 5L), .Label = c("CHICHE", "GEOF", "JACQUE",
"LANGUE-LOPEZ", "RIVIER", "TRU", "VINCENT"), class = "factor"),
     Prenom = structure(c(6L, 3L, 3L, 4L, 1L, 2L, 2L, 7L, 5L,
     5L, 5L), .Label = c("Edgar", "Elodie", "Jeannine", "Michel",
     "Mich?le", "Michelle", "Victor"), class = "factor"), Sexe = 
structure(c(1L,
     1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("F?minin",
     "Masculin"), class = "factor"), Date.de.naissance = structure(c(4L,
     2L, 2L, 7L, 6L, 5L, 5L, 1L, 3L, 3L, 3L), .Label = c("03/09/1940",
     "04/03/1946", "07/12/1947", "18/11/1945", "27/09/1947", "29/12/1936",
     "30/03/1935"), class = "factor")), .Names = c("Matricule",
"Nom", "Prenom", "Sexe", "Date.de.naissance"), class = "data.frame", 
row.names = c(NA,
-11L))

Thank for your helps
Michel
#
On 24-07-2013, at 08:39, Arnaud Michel <michel.arnaud at cirad.fr> wrote:

            
Something like this

r1 <- droplevels(do.call(rbind,lapply(split(TEST,TEST$Matricule),
                    FUN=function(x) {x[,c("Nom","Prenom")] <- x[1,c("Nom","Prenom"),drop=TRUE];x})))
rownames(r1) <- NULL 
r1

r2 <- droplevels(do.call(rbind,lapply(split(TEST,TEST$Matricule),
                    FUN=function(x) {x[,c("Nom","Prenom")] <- x[nrow(x),c("Nom","Prenom"),drop=TRUE];x})))
rownames(r2) <- NULL
r2

#> identical(r1,df1)
#[1] TRUE
#> identical(r2,df2)
#[1] TRUE

Note: I had to change the Prenom and Sexe columns because of encoding issues. but that shouldn't have any influence on the above.

Berend
#
Thank you Berend
It is exactly what I wanted.
Michel
Le 24/07/2013 09:48, Berend Hasselman a ?crit :

  
    
#
Hi Michel,
You could try:


df1New<-droplevels(TEST[with(TEST,ave(seq_along(Matricule),Matricule,FUN=min)),])
row.names(df1New)<-1:nrow(df1New)
df2New<-droplevels(TEST[with(TEST,ave(seq_along(Matricule),Matricule,FUN=max)),])
row.names(df2New)<-1:nrow(df2New)
?identical(df1New,df1)
#[1] TRUE
?identical(df2New,df2)
#[1] TRUE
A.K.



----- Original Message -----
From: Arnaud Michel <michel.arnaud at cirad.fr>
To: R help <r-help at r-project.org>
Cc: 
Sent: Wednesday, July 24, 2013 2:39 AM
Subject: [R] Change values in a dateframe

Hello

I have the following problem :
The dataframe TEST has multiple lines for a same person because :
there are differents values of Nom or differents values of Prenom
but the values of Matricule or Sexe or Date.de.naissance are the same.

TEST <- structure(list(Matricule = c(66L, 67L, 67L, 68L, 89L, 90L, 90L,
91L, 108L, 108L, 108L), Nom = structure(c(1L, 2L, 2L, 4L, 8L,
5L, 6L, 9L, 3L, 3L, 7L), .Label = c("CHICHE", "GEOF", "GUTIER",
"JACQUE", "LANGUE", "LANGUE-LOPEZ", "RIVIER", "TRU", "VINCENT"
), class = "factor"), Prenom = structure(c(8L, 3L, 4L, 5L, 1L,
2L, 2L, 9L, 6L, 7L, 7L), .Label = c("Edgar", "Elodie", "Jeanine",
"Jeannine", "Michel", "Michele", "Mich?le", "Michelle", "Victor"
), class = "factor"), Sexe = structure(c(1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 1L, 1L), .Label = c("F?minin", "Masculin"), class = 
"factor"),
? ?  Date.de.naissance = structure(c(4L, 2L, 2L, 7L, 6L, 5L, 5L,
? ?  1L, 3L, 3L, 3L), .Label = c("03/09/1940", "04/03/1946", "07/12/1947",
? ?  "18/11/1945", "27/09/1947", "29/12/1936", "30/03/1935"), class = 
"factor")), .Names = c("Matricule",
"Nom", "Prenom", "Sexe", "Date.de.naissance"), class = "data.frame", 
row.names = c(NA,
-11L))


I would want to make homogeneous the information and would like built 2 
dataframes :
df1 wich has the value of Nom and Prenom of the first lines of TEST when 
there are different values. The other values (Matricule or Sexe or 
Date.de.naissance) are unchanged

df1 <- structure(list(Matricule = c(66L, 67L, 67L, 68L, 89L, 90L, 90L,
91L, 108L, 108L, 108L), Nom = structure(c(1L, 2L, 2L, 4L, 6L,
5L, 5L, 7L, 3L, 3L, 3L), .Label = c("CHICHE", "GEOF", "GUTIER",
"JACQUE", "LANGUE", "TRU", "VINCENT"), class = "factor"), Prenom = 
structure(c(6L,
3L, 3L, 4L, 1L, 2L, 2L, 7L, 5L, 5L, 5L), .Label = c("Edgar",
"Elodie", "Jeanine", "Michel", "Michele", "Michelle", "Victor"
), class = "factor"), Sexe = structure(c(1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 1L, 1L), .Label = c("F?minin", "Masculin"), class = 
"factor"),
? ?  Date.de.naissance = structure(c(4L, 2L, 2L, 7L, 6L, 5L, 5L,
? ?  1L, 3L, 3L, 3L), .Label = c("03/09/1940", "04/03/1946", "07/12/1947",
? ?  "18/11/1945", "27/09/1947", "29/12/1936", "30/03/1935"), class = 
"factor")), .Names = c("Matricule",
"Nom", "Prenom", "Sexe", "Date.de.naissance"), class = "data.frame", 
row.names = c(NA,
-11L))

df2 wich has the value of Nom and Prenom of the last lines of TEST when 
there are different values. The other values (Matricule or Sexe or 
Date.de.naissance) are unchanged.

df2 <- structure(list(Matricule = c(66L, 67L, 67L, 68L, 89L, 90L, 90L,
91L, 108L, 108L, 108L), Nom = structure(c(1L, 2L, 2L, 3L, 6L,
4L, 4L, 7L, 5L, 5L, 5L), .Label = c("CHICHE", "GEOF", "JACQUE",
"LANGUE-LOPEZ", "RIVIER", "TRU", "VINCENT"), class = "factor"),
? ?  Prenom = structure(c(6L, 3L, 3L, 4L, 1L, 2L, 2L, 7L, 5L,
? ?  5L, 5L), .Label = c("Edgar", "Elodie", "Jeannine", "Michel",
? ?  "Mich?le", "Michelle", "Victor"), class = "factor"), Sexe = 
structure(c(1L,
? ?  1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("F?minin",
? ?  "Masculin"), class = "factor"), Date.de.naissance = structure(c(4L,
? ?  2L, 2L, 7L, 6L, 5L, 5L, 1L, 3L, 3L, 3L), .Label = c("03/09/1940",
? ?  "04/03/1946", "07/12/1947", "18/11/1945", "27/09/1947", "29/12/1936",
? ?  "30/03/1935"), class = "factor")), .Names = c("Matricule",
"Nom", "Prenom", "Sexe", "Date.de.naissance"), class = "data.frame", 
row.names = c(NA,
-11L))

Thank for your helps
Michel
#
Hi Arun,
Merci ? toi
Bien amicalement
Michel
Le 24/07/2013 15:29, arun a ?crit :

  
    
#
Hi

For a dataframe with name PaysContrat1 and with
nrow(PaysContrat1)
[1] 52366

the test of system.time is :

system.time(droplevels(do.call(rbind,lapply(split(PaysContrat1,PaysContrat1$Matricule),
FUN=function(x) {x[,c("Nom","Pr?nom")] <- 
x[nrow(x),c("Nom","Pr?nom"),drop=TRUE];x}))))
    user  system elapsed
   14.03    0.00   14.04

system.time(droplevels(PaysContrat1[with(PaysContrat1,ave(seq_along(Matricule),Matricule,FUN=min)) 
,]  ))
    user  system elapsed
     0.2     0.0     0.2

Michel

Le 24/07/2013 15:29, arun a ?crit :

  
    
#
But I just noticed that the two solutions are not comparable :
the change concern only Nom and Prenom (solution Berend) and not also 
Sexe or Date.de.naissance orother variables (solution Arun) that can 
changed. But my question was badly put.
Michel

Le 25/07/2013 08:06, Arnaud Michel a ?crit :

  
    
#
On 25-07-2013, at 08:35, Arnaud Michel <michel.arnaud at cirad.fr> wrote:

            
Indeed:-)

But that can be remedied with (small correction w.r.t. initial solution: drop=TRUE removed; not relevant here)

r1 <- droplevels(do.call(rbind,lapply(split(TEST,TEST$Matricule),
                    FUN=function(x) {x[,1:ncol(x)] <- x[1,1:ncol(x)];x})))

and

r2 <- droplevels(do.call(rbind,lapply(split(TEST,TEST$Matricule),
                    FUN=function(x) {x[,1:ncol(x)] <- x[nrow(x),1:ncol(x)];x})))

Less elegant than alternative with ave

Berend
#
Le 25/07/2013 08:50, Berend Hasselman a ?crit :
Thank you but I keep
{x[,c("Nom","Pr?nom")] <- x[nrow(x),c("Nom","Pr?nom")];x} because in the 
dataframe there are other variables that I do not want to change. I want 
change only "Nom" and "Pr?nom"

PS : ?w.r.t.
Michel

  
    
#
Hi Michel,
Sorry, I misunderstood your question.
You could also try:
library(plyr)
df1New<-droplevels(ddply(TEST,.(Matricule),function(x) {x[,c("Nom","Prenom")]<- x[1,c("Nom","Prenom")];x}))
df2New<-droplevels(ddply(TEST,.(Matricule),function(x) {x[,c("Nom","Prenom")]<- x[nrow(x),c("Nom","Prenom")];x}))
?identical(df1,df1New)
#[1] TRUE
?identical(df2,df2New)
#[1] TRUE

#or

library(data.table)
dt1<- data.table(TEST)
dt2<-dt1
dt1[,Nom:=head(Nom,1),by=Matricule]
dt1[,Prenom:=head(Prenom,1),by=Matricule]
?identical(df1,droplevels(as.data.frame(dt1)))
#[1] TRUE

dt2[,Nom:=tail(Nom,1),by=Matricule]
dt2[,Prenom:=tail(Prenom,1),by=Matricule]
?identical(df2,droplevels(as.data.frame(dt2)))
#[1] TRUE


#If you are considering speed, then ?data.table() would be useful
set.seed(28)
dfTest<- as.data.frame(matrix(sample(1:50,1e6*5,replace=TRUE),ncol=5))

system.time({res1<-do.call(rbind,lapply(split(dfTest,dfTest$V1),
??????????????????? FUN=function(x) {x[,c("V2","V3")] <- x[1,c("V2","V3")];x}))})
#? user? system elapsed 
#? 4.452?? 0.036?? 4.499 
row.names(res1)<-1:nrow(res1)

dtNew<- data.table(dfTest)
system.time({dtNew[,V2:=head(V2,1),by=V1]
??? dtNew[,V3:=head(V3,1),by=V1]
??? ?dtNew<-dtNew[order(V1)]? #here, the dataset was not pre-sorted, so just to keep the same order as the above solution

??? })
# user? system elapsed 
#?? 0.132?? 0.000?? 0.133 
identical(res1,as.data.frame(dtNew))
#[1] TRUE


A.K.




----- Original Message -----
From: Arnaud Michel <michel.arnaud at cirad.fr>
To: Berend Hasselman <bhh at xs4all.nl>
Cc: R help <r-help at r-project.org>
Sent: Thursday, July 25, 2013 3:59 AM
Subject: Re: [R] Change values in a dateframe-Speed TEST

Le 25/07/2013 08:50, Berend Hasselman a ?crit :
Thank you but I keep
{x[,c("Nom","Pr?nom")] <- x[nrow(x),c("Nom","Pr?nom")];x} because in the 
dataframe there are other variables that I do not want to change. I want 
change only "Nom" and "Pr?nom"

PS : ?w.r.t.
Michel