Skip to content

simplify a dataframe

9 messages · Rui Barradas, jim holtman, Arnaud Michel +1 more

#
Hello

I have the following problem : group the lines of a dataframe when no 
information change (Matricule, Nom, Sexe, DateNaissance, Contrat, Pays) 
and when the value of Debut of lines i = value Fin of lines i-1
I can obtain it with a do loop. Is it possible to avoid the loop ?

The dataframe initial is df1
dput(df1)
structure(list(Matricule = c(1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L,
6L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L), Nom = c("VERON", "VERON", "VERON", "BENARD",
"BENARD", "BENARD", "BENARD", "BENARD", "BENARD", "DALNIC", "DALNIC",
"DALNIC", "DALNIC", "DALNIC", "DALNIC", "DALNIC", "DALNIC", "DALNIC",
"FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI"), Sexe = c("F?minin", "F?minin", "F?minin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"F?minin", "F?minin", "F?minin", "F?minin", "F?minin", "F?minin",
"F?minin", "F?minin", "F?minin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin"), DateNaissance = c("02/09/1935",
"02/09/1935", "02/09/1935", "01/04/1935", "01/04/1935", "01/04/1935",
"01/04/1935", "01/04/1935", "01/04/1935", "19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940", "19/02/1940", "19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961"), contrat = c("CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad",
"CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad", "CDD d?tach? ext. 
Cirad",
"CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad", "CDD d?tach? ext. 
Cirad",
"CDD d?tach? ext. Cirad", "CDI D?tach?s Autres", "CDI D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun"), Pays = c("France", "France", "France", "Philippines",
"Philippines", "Philippines", "France", "France", "France", "France",
"France", "Martinique", "Martinique", "Martinique", "Martinique",
"Martinique", "Martinique", "Martinique", "Cameroun", "Cameroun",
"Cameroun", "Cameroun", "Cameroun", "Cameroun", "Cameroun", "Cameroun",
"Cameroun", "France", "France", "France", "France", "France",
"France", "France", "Congo", "Congo", "Congo", "Congo", "Congo",
"Congo", "Gabon", "Gabon", "Gabon", "Gabon", "Gabon", "Gabon",
"Congo", "Congo"), Debut = c("24/01/1995", "01/05/1997", "31/12/1997",
"02/02/1995", "28/02/1995", "01/03/1995", "13/03/1995", "01/01/1996",
"31/01/1996", "24/01/1995", "01/07/1995", "01/09/1995", "01/07/1997",
"01/01/1998", "01/08/1998", "01/01/2000", "17/01/2000", "29/02/2000",
"26/01/1995", "01/07/1996", "16/09/1997", "01/01/1998", "01/07/1998",
"04/11/1999", "01/01/2001", "01/04/2001", "31/08/2001", "01/09/2001",
"02/09/2001", "01/12/2001", "01/02/2003", "01/04/2003", "01/01/2004",
"01/03/2004", "01/09/2004", "01/01/2005", "01/04/2005", "28/10/2006",
"01/01/2007", "01/04/2007", "01/09/2007", "01/01/2009", "01/04/2009",
"01/01/2010", "01/01/2011", "01/04/2011", "05/09/2012", "01/01/2013"
), Fin = c("30/04/1997", "30/12/1997", "31/12/1997", "27/02/1995",
"28/02/1995", "12/03/1995", "30/06/1995", "30/01/1996", "31/01/1996",
"30/06/1995", "31/08/1995", "30/06/1997", "31/12/1997", "31/07/1998",
"31/12/1999", "16/01/2000", "28/02/2000", "29/02/2000", "30/06/1996",
"15/09/1997", "31/12/1997", "30/06/1998", "03/11/1999", "31/12/2000",
"31/03/2001", "30/08/2001", "31/08/2001", "01/09/2001", "30/11/2001",
"31/01/2003", "31/03/2003", "31/12/2003", "29/02/2004", "31/08/2004",
"31/12/2004", "31/03/2005", "27/10/2006", "31/12/2006", "31/03/2007",
"31/08/2007", "31/12/2008", "31/03/2009", "31/12/2009", "31/12/2010",
"31/03/2011", "04/09/2012", "31/12/2012", "31/12/4712")), .Names = 
c("Matricule",
"Nom", "Sexe", "DateNaissance", "contrat", "Pays", "Debut", "Fin"
), class = "data.frame", row.names = c(NA, -48L))

The dataframe to be obtained is df2
dput(df2)
structure(list(Mat = c(1L, 6L, 6L, 6L, 8L, 8L, 934L, 934L, 934L,
934L, 934L), Nom = c("VERON", "BENARD", "BENARD", "BENARD", "DALNIC",
"DALNIC", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI"), Sexe = c("F?minin",
"Masculin", "Masculin", "Masculin", "F?minin", "F?minin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin"), DateNaissance = 
c("02/09/1935",
"01/04/1935", "01/04/1935", "01/04/1935", "19/02/1940", "19/02/1940",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961"
), contrat = c("CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDD d?tach? ext. Cirad", "CDI D?tach?s 
Autres",
"CDI D?tach?s Autres", "CDI commun", "CDI commun"), Pays = c("France",
"Philippines", "France", "France", "France", "Martinique", "Cameroun",
"France", "Congo", "Gabon", "Congo"), Debut = c("24/01/1995",
"02/02/1995", "13/03/1995", "01/01/1996", "24/01/1995", "01/09/1995",
"26/01/1995", "01/09/2001", "01/09/2004", "01/09/2007", "05/09/2012"
), Fin = c("31/12/1997", "12/03/1995", "30/06/1995", "31/01/1996",
"31/08/1995", "29/02/2000", "31/08/2001", "31/08/2004", "31/08/2007",
"04/09/2012", "31/12/4712")), .Names = c("Mat", "Nom", "Sexe",
"DateNaissance", "contrat", "Pays", "Debut", "Fin"), class = 
"data.frame", row.names = c(NA,
-11L))

Thank you for your help
#
Hello,

My solution is missing a row, but maybe you can find some inspiration.


cols <- c("Matricule", "Nom", "Sexe", "DateNaissance", "contrat", "Pays")
irow1 <- duplicated(df1[, cols])
irow2 <- c(FALSE, df1$Debut[-1] == df1$Fin[-nrow(df1)])

df3 <- df1[!irow1 & !irow2, ]

dim(df2); dim(df3)  # df3 has one row less
df2; df3


Hope this helps,

Rui Barradas

Em 12-07-2013 20:45, Arnaud Michel escreveu:
#
Hi,
"when the value of Debut of lines i = value Fin of lines i-1"
That part is not clear esp. when it is looked upon with the expected output (df2).? Also, in your example dataset:

df1$contrat[grep("^CDD",df1$contrat)]
#[1] "CDD d?tach? ext. Cirad" "CDD d?tach? ext. Cirad" "CDD d?tach? ext. Cirad"
#[4] "CDD d?tach? ext. Cirad" "CDD d?tach? ext.Cirad"? "CDD d?tach? ext. Cirad"
#[7] "CDD d?tach? ext. Cirad" "CDD d?tach? ext.Cirad"? "CDD d?tach? ext. Cirad"
##Looks like there are extra spaces in some of them.? I guess these are the same
df1$contrat[grep("^CDD",df1$contrat)]<- "CDD d?tach? ext. Cirad"


I tried this:
indx<-as.numeric(interaction(df1[,1:6],drop=FALSE))

?df1New<- df1
res2<-unique(within(df1New,{Debut<-ave(seq_along(indx),indx,FUN=function(x) Debut[head(x,1)]);Fin<- ave(seq_along(indx),indx,FUN=function(x) Fin[tail(x,1)])}))
?row.names(res2)<- 1:nrow(res2)

res2[,c(1,2,7:8)]
?? Matricule??? Nom????? Debut??????? Fin
1????????? 1? VERON 24/01/1995 31/12/1997
2????????? 6 BENARD 02/02/1995 12/03/1995
3????????? 6 BENARD 13/03/1995 31/01/1996 ###here not correct
4????????? 8 DALNIC 24/01/1995 31/08/1995
5????????? 8 DALNIC 01/09/1995 29/02/2000
6??????? 934? FORNI 26/01/1995 31/08/2001
7??????? 934? FORNI 01/09/2001 31/08/2004
8??????? 934? FORNI 01/09/2004 31/08/2007
9??????? 934? FORNI 01/09/2007 04/09/2012
10?????? 934? FORNI 05/09/2012 31/12/4712


df2[,c(1,2,7:8)]
?? Mat??? Nom????? Debut??????? Fin
1??? 1? VERON 24/01/1995 31/12/1997
2??? 6 BENARD 02/02/1995 12/03/1995
3??? 6 BENARD 13/03/1995 30/06/1995
4??? 6 BENARD 01/01/1996 31/01/1996 #missing this row 
5??? 8 DALNIC 24/01/1995 31/08/1995
6??? 8 DALNIC 01/09/1995 29/02/2000
7? 934? FORNI 26/01/1995 31/08/2001
8? 934? FORNI 01/09/2001 31/08/2004
9? 934? FORNI 01/09/2004 31/08/2007
10 934? FORNI 01/09/2007 04/09/2012
11 934? FORNI 05/09/2012 31/12/4712


Here, the dates look similar to the ones on df2 except for one row in df2.

A.K.




----- Original Message -----
From: Arnaud Michel <michel.arnaud at cirad.fr>
To: R help <r-help at r-project.org>
Cc: 
Sent: Friday, July 12, 2013 3:45 PM
Subject: [R] simplify a dataframe

Hello

I have the following problem : group the lines of a dataframe when no 
information change (Matricule, Nom, Sexe, DateNaissance, Contrat, Pays) 
and when the value of Debut of lines i = value Fin of lines i-1
I can obtain it with a do loop. Is it possible to avoid the loop ?

The dataframe initial is df1
dput(df1)
structure(list(Matricule = c(1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L,
6L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L), Nom = c("VERON", "VERON", "VERON", "BENARD",
"BENARD", "BENARD", "BENARD", "BENARD", "BENARD", "DALNIC", "DALNIC",
"DALNIC", "DALNIC", "DALNIC", "DALNIC", "DALNIC", "DALNIC", "DALNIC",
"FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI"), Sexe = c("F?minin", "F?minin", "F?minin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"F?minin", "F?minin", "F?minin", "F?minin", "F?minin", "F?minin",
"F?minin", "F?minin", "F?minin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin"), DateNaissance = c("02/09/1935",
"02/09/1935", "02/09/1935", "01/04/1935", "01/04/1935", "01/04/1935",
"01/04/1935", "01/04/1935", "01/04/1935", "19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940", "19/02/1940", "19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961"), contrat = c("CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad",
"CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad", "CDD d?tach? ext. 
Cirad",
"CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad", "CDD d?tach? ext. 
Cirad",
"CDD d?tach? ext. Cirad", "CDI D?tach?s Autres", "CDI D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun"), Pays = c("France", "France", "France", "Philippines",
"Philippines", "Philippines", "France", "France", "France", "France",
"France", "Martinique", "Martinique", "Martinique", "Martinique",
"Martinique", "Martinique", "Martinique", "Cameroun", "Cameroun",
"Cameroun", "Cameroun", "Cameroun", "Cameroun", "Cameroun", "Cameroun",
"Cameroun", "France", "France", "France", "France", "France",
"France", "France", "Congo", "Congo", "Congo", "Congo", "Congo",
"Congo", "Gabon", "Gabon", "Gabon", "Gabon", "Gabon", "Gabon",
"Congo", "Congo"), Debut = c("24/01/1995", "01/05/1997", "31/12/1997",
"02/02/1995", "28/02/1995", "01/03/1995", "13/03/1995", "01/01/1996",
"31/01/1996", "24/01/1995", "01/07/1995", "01/09/1995", "01/07/1997",
"01/01/1998", "01/08/1998", "01/01/2000", "17/01/2000", "29/02/2000",
"26/01/1995", "01/07/1996", "16/09/1997", "01/01/1998", "01/07/1998",
"04/11/1999", "01/01/2001", "01/04/2001", "31/08/2001", "01/09/2001",
"02/09/2001", "01/12/2001", "01/02/2003", "01/04/2003", "01/01/2004",
"01/03/2004", "01/09/2004", "01/01/2005", "01/04/2005", "28/10/2006",
"01/01/2007", "01/04/2007", "01/09/2007", "01/01/2009", "01/04/2009",
"01/01/2010", "01/01/2011", "01/04/2011", "05/09/2012", "01/01/2013"
), Fin = c("30/04/1997", "30/12/1997", "31/12/1997", "27/02/1995",
"28/02/1995", "12/03/1995", "30/06/1995", "30/01/1996", "31/01/1996",
"30/06/1995", "31/08/1995", "30/06/1997", "31/12/1997", "31/07/1998",
"31/12/1999", "16/01/2000", "28/02/2000", "29/02/2000", "30/06/1996",
"15/09/1997", "31/12/1997", "30/06/1998", "03/11/1999", "31/12/2000",
"31/03/2001", "30/08/2001", "31/08/2001", "01/09/2001", "30/11/2001",
"31/01/2003", "31/03/2003", "31/12/2003", "29/02/2004", "31/08/2004",
"31/12/2004", "31/03/2005", "27/10/2006", "31/12/2006", "31/03/2007",
"31/08/2007", "31/12/2008", "31/03/2009", "31/12/2009", "31/12/2010",
"31/03/2011", "04/09/2012", "31/12/2012", "31/12/4712")), .Names = 
c("Matricule",
"Nom", "Sexe", "DateNaissance", "contrat", "Pays", "Debut", "Fin"
), class = "data.frame", row.names = c(NA, -48L))

The dataframe to be obtained is df2
dput(df2)
structure(list(Mat = c(1L, 6L, 6L, 6L, 8L, 8L, 934L, 934L, 934L,
934L, 934L), Nom = c("VERON", "BENARD", "BENARD", "BENARD", "DALNIC",
"DALNIC", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI"), Sexe = c("F?minin",
"Masculin", "Masculin", "Masculin", "F?minin", "F?minin", "Masculin",
"Masculin", "Masculin", "Masculin", "Masculin"), DateNaissance = 
c("02/09/1935",
"01/04/1935", "01/04/1935", "01/04/1935", "19/02/1940", "19/02/1940",
"10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961"
), contrat = c("CDI commun", "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDD d?tach? ext. Cirad", "CDI D?tach?s 
Autres",
"CDI D?tach?s Autres", "CDI commun", "CDI commun"), Pays = c("France",
"Philippines", "France", "France", "France", "Martinique", "Cameroun",
"France", "Congo", "Gabon", "Congo"), Debut = c("24/01/1995",
"02/02/1995", "13/03/1995", "01/01/1996", "24/01/1995", "01/09/1995",
"26/01/1995", "01/09/2001", "01/09/2004", "01/09/2007", "05/09/2012"
), Fin = c("31/12/1997", "12/03/1995", "30/06/1995", "31/01/1996",
"31/08/1995", "29/02/2000", "31/08/2001", "31/08/2004", "31/08/2007",
"04/09/2012", "31/12/4712")), .Names = c("Mat", "Nom", "Sexe",
"DateNaissance", "contrat", "Pays", "Debut", "Fin"), class = 
"data.frame", row.names = c(NA,
-11L))

Thank you for your help
1 day later
#
Hi,
May be this helps you.
df1$contrat[grep("^CDD",df1$contrat)]<- "CDD d?tach? ext. Cirad" 
df1[48,8]
[1] "31/12/4712" #strange value

df1[48,8]<- "31/12/2013"? #changed

indx<-as.numeric(interaction(df1[,1:6],drop=TRUE))
res<-do.call(rbind,lapply(split(df1,indx),function(x) {x1<- as.Date(x$Debut,format="%d/%m/%Y");x2<- as.Date(x$Fin,format="%d/%m/%Y");do.call(rbind,lapply(split(x,cumsum(c(FALSE,(x1[-1]-x2[-nrow(x)])!=1))),function(x) data.frame(x[1,1:6],Debut=head(x$Debut,1),Fin=tail(x$Fin,1),stringsAsFactors=FALSE)))}))

?res[order(res$Matricule),]? #the order of rows is a bit different than df2.
??? Matricule??? Nom???? Sexe DateNaissance??????????????? contrat??????? Pays
5?????????? 1? VERON? F?minin??? 02/09/1935???????????? CDI commun????? France
4.0???????? 6 BENARD Masculin??? 01/04/1935???????????? CDI commun????? France
4.1???????? 6 BENARD Masculin??? 01/04/1935???????????? CDI commun????? France
10????????? 6 BENARD Masculin??? 01/04/1935???????????? CDI commun Philippines
6?????????? 8 DALNIC? F?minin??? 19/02/1940???????????? CDI commun????? France
9?????????? 8 DALNIC? F?minin??? 19/02/1940???????????? CDI commun? Martinique
1???????? 934? FORNI Masculin??? 10/07/1961 CDD d?tach? ext. Cirad??? Cameroun
2???????? 934? FORNI Masculin??? 10/07/1961???????????? CDI commun?????? Congo
3???????? 934? FORNI Masculin??? 10/07/1961??? CDI D?tach?s Autres?????? Congo
7???????? 934? FORNI Masculin??? 10/07/1961??? CDI D?tach?s Autres????? France
8???????? 934? FORNI Masculin??? 10/07/1961???????????? CDI commun?????? Gabon
???????? Debut??????? Fin
5?? 24/01/1995 31/12/1997
4.0 13/03/1995 30/06/1995
4.1 01/01/1996 31/01/1996
10? 02/02/1995 12/03/1995
6?? 24/01/1995 31/08/1995
9?? 01/09/1995 29/02/2000
1?? 26/01/1995 31/08/2001
2?? 05/09/2012 31/12/2013
3?? 01/09/2004 31/08/2007
7?? 01/09/2001 31/08/2004
8?? 01/09/2007 04/09/2012


A.K.
#
HI Michel,
This gives the same order as that of df2.
df1$contrat[grep("^CDD",df1$contrat)]<- "CDD d?tach? ext. Cirad"
df1[48,8]<- "31/12/2013"
indx<-as.numeric(interaction(df1[,1:6],drop=TRUE))
lst1<-split(df1,indx)
?lst2<-lst1[match(unique(indx),names(lst1))]
res<-do.call(rbind,lapply(lst2,function(x){x1<- as.Date(x$Debut,format="%d/%m/%Y");x2<- as.Date(x$Fin,format="%d/%m/%Y");do.call(rbind,lapply(split(x,cumsum(c(FALSE,(x1[-1]-x2[-nrow(x)])!=1))),function(x) data.frame(x[1,1:6],Debut=head(x$Debut,1),Fin=tail(x$Fin,1),stringsAsFactors=FALSE)))}))
?row.names(res)<- 1:nrow(res)
?df2[11,8]<- "31/12/2013"
?names(res)[1]<- "Mat"
?identical(res,df2)
#[1] TRUE


A.K.



----- Original Message -----
From: arun <smartpink111 at yahoo.com>
To: Arnaud Michel <michel.arnaud at cirad.fr>
Cc: R help <r-help at r-project.org>
Sent: Sunday, July 14, 2013 2:39 PM
Subject: Re: [R] simplify a dataframe

Hi,
May be this helps you.
df1$contrat[grep("^CDD",df1$contrat)]<- "CDD d?tach? ext. Cirad" 
df1[48,8]
[1] "31/12/4712" #strange value

df1[48,8]<- "31/12/2013"? #changed

indx<-as.numeric(interaction(df1[,1:6],drop=TRUE))
res<-do.call(rbind,lapply(split(df1,indx),function(x) {x1<- as.Date(x$Debut,format="%d/%m/%Y");x2<- as.Date(x$Fin,format="%d/%m/%Y");do.call(rbind,lapply(split(x,cumsum(c(FALSE,(x1[-1]-x2[-nrow(x)])!=1))),function(x) data.frame(x[1,1:6],Debut=head(x$Debut,1),Fin=tail(x$Fin,1),stringsAsFactors=FALSE)))}))

?res[order(res$Matricule),]? #the order of rows is a bit different than df2.
??? Matricule??? Nom???? Sexe DateNaissance??????????????? contrat??????? Pays
5?????????? 1? VERON? F?minin??? 02/09/1935???????????? CDI commun????? France
4.0???????? 6 BENARD Masculin??? 01/04/1935???????????? CDI commun????? France
4.1???????? 6 BENARD Masculin??? 01/04/1935???????????? CDI commun????? France
10????????? 6 BENARD Masculin??? 01/04/1935???????????? CDI commun Philippines
6?????????? 8 DALNIC? F?minin??? 19/02/1940???????????? CDI commun????? France
9?????????? 8 DALNIC? F?minin??? 19/02/1940???????????? CDI commun? Martinique
1???????? 934? FORNI Masculin??? 10/07/1961 CDD d?tach? ext. Cirad??? Cameroun
2???????? 934? FORNI Masculin??? 10/07/1961???????????? CDI commun?????? Congo
3???????? 934? FORNI Masculin??? 10/07/1961??? CDI D?tach?s Autres?????? Congo
7???????? 934? FORNI Masculin??? 10/07/1961??? CDI D?tach?s Autres????? France
8???????? 934? FORNI Masculin??? 10/07/1961???????????? CDI commun?????? Gabon
???????? Debut??????? Fin
5?? 24/01/1995 31/12/1997
4.0 13/03/1995 30/06/1995
4.1 01/01/1996 31/01/1996
10? 02/02/1995 12/03/1995
6?? 24/01/1995 31/08/1995
9?? 01/09/1995 29/02/2000
1?? 26/01/1995 31/08/2001
2?? 05/09/2012 31/12/2013
3?? 01/09/2004 31/08/2007
7?? 01/09/2001 31/08/2004
8?? 01/09/2007 04/09/2012


A.K.
#
Super !!!
Thank you very much Arun
Michel
Le 15/07/2013 03:47, arun a ?crit :