On Behalf Of Mohammad Alimohammadi
Sent: Wednesday, May 27, 2015 1:47 PM
To: John Kane; r-help at r-project.org<mailto:r-help at r-project.org>
Subject: Re: [R] Problem with comparing multiple data sets
OK, so I read about the "modeest" package, which gives the result that I
am looking for (the most repeated value).
I modified the data frame a little and moved the text to the first
column.
This is the data frame with all 3 possible classes for each term.
=================================
structure(list(terms = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L,
4L, 4L, 4L, 3L, 3L, 3L, 3L, 2L, 2L, 2L), .Label = c("#dac",
"#mac,#security",
"accountability,anonymous", "data security,encryption,security"
), class = "factor"), class.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L), class.2 = c(2L, 2L,
2L, 2L, 0L, 0L, 2L, 0L, 0L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L,
0L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 0L, 0L, 0L, 0L, 1L, 1L, 1L),
class.3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 1L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 2L, 1L, 2L)), .Names = c("terms", "class.1",
"class.2", "class.3"), class = "data.frame", row.names = c(NA,
-49L))
=============================================
#Then I applied the function below:
======================
library(modeest)
# Read the comma-separated file; `header` is spelled out in full rather than
# relying on partial matching of `head`.
df <- read.csv(file = "short.csv", header = TRUE, sep = ",")
# Most frequent value (mfv) of each row, computed over every column except
# the first. `df[, -1, drop = FALSE]` picks the same columns as
# `2:length(df)` when there are three or more, but it stays a data frame
# when only two columns exist and does not count backwards (2:1) when there
# is just one.
apply(df[, -1, drop = FALSE], 1, mfv)
============================
# It gives the most frequent value for each row which is what I need. The
only problem is that all the values are displayed in one single row.
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0
0 0 2 1 1 1 1 0 0 0 0 2 1 2
It would be much better to show them in separate rows.
For example:
[1] 0
[2] 0
[3] 1
....
Any idea how to do this?
On Wed, May 27, 2015 at 10:11 AM, Mohammad Alimohammadi <
mxalimohamma at ualr.edu<mailto:mxalimohamma at ualr.edu>> wrote:
Hi Jim,
Thank you for your advice.
I'm not sure exactly how to incorporate this function, though. I added a
portion of the actual data sets. All 3 data sets have the same items
(text) with different class values, so I need to assign the most repeated
class (0, 1, 2) to each text.
For example: if line 1 has the text "aaa", it may be assigned to class 0 in
dat1, 2 in dat2 and 0 in dat3. In this case "aaa" will be assigned to
0 (the most repeated value). The same goes for each text.
I really appreciate your help.
=========================================
*dat1*
structure(list(class.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L), terms = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 2L, 2L, 2L), .Label =
c("#dac",
"#mac,#security", "accountability,anonymous", "data
security,encryption,security"
), class = "factor")), .Names = c("class.1", "terms"), class =
"data.frame", row.names = c(NA,
-49L))
*dat2*
structure(list(class.2 = c(2L, 2L, 2L, 2L, 0L, 0L, 2L, 0L, 0L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 2L, 2L, 2L, 1L, 1L, 2L,
2L, 0L, 0L, 0L, 0L, 1L, 1L, 1L), terms = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 2L, 2L, 2L), .Label =
c("#dac",
"#mac,#security", "accountability,anonymous", "data
security,encryption,security"
), class = "factor")), .Names = c("class.2", "terms"), class =
"data.frame", row.names = c(NA,
-49L))
*dat3*
structure(list(class.3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 2L, 1L, 2L), terms = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 2L, 2L, 2L), .Label =
c("#dac",
"#mac,#security", "accountability,anonymous", "data
security,encryption,security"
), class = "factor")), .Names = c("class.3", "terms"), class =
"data.frame", row.names = c(NA,
-49L))
===========================================================
On Sun, May 24, 2015 at 1:15 AM, Jim Lemon
<drjimlemon at gmail.com<mailto:drjimlemon at gmail.com>> wrote:
Hi Mohammad,
You know, I thought this would be fairly easy, but it wasn't really.
# Three toy data sets with the same Comment/Term/Text columns; only the
# Class column differs between them.
df1 <- data.frame(
  Class = c(0, 2, 1),
  Comment = c("com1", "com2", "com3"),
  Term = c("aac", "aax", "vvx"),
  Text = c("text1", "text2", "text3")
)
# The second data set is an exact copy of the first.
df2 <- df1
# The third data set has the same rows with a different Class for each text.
df3 <- df1
df3$Class <- c(2, 1, 0)
# Collect the data sets in a list and show it.
dflist <- list(df1, df2, df3)
dflist
# Extract the values of one field from the rows selected by a value in
# another field.
#
#   x       data frame to read from
#   field1  name or index of the column used to select rows
#   value1  value that `field1` must equal for a row to be selected
#   field2  name or index of the column whose values are returned
#
# Returns the `field2` entries of the selected rows (length 0 if none match).
extract_by_value <- function(x, field1, value1, field2) {
  hit <- x[, field1] == value1
  # A comparison against NA yields NA, which as a row index would inject an
  # NA entry into the result; treat such rows as "not selected".
  hit <- !is.na(hit) & hit
  x[hit, field2]
}
# Overwrite one field with a single value in the rows selected by a value
# in another field.
#
#   x       data frame to modify (a modified copy is returned)
#   field1  name or index of the column used to select rows
#   value1  value that `field1` must equal for a row to be selected
#   field2  name or index of the column that is overwritten
#   value2  replacement value written into `field2` of the selected rows
#
# Returns `x` with the replacement applied; rows that do not match are
# left as they were.
sub_value <- function(x, field1, value1, field2, value2) {
  hit <- x[, field1] == value1
  # Data-frame assignment rejects NA in the row index, so rows whose
  # `field1` is NA are treated as "not selected" instead of stopping.
  hit <- !is.na(hit) & hit
  x[hit, field2] <- value2
  x
}
# Make every data frame in a list agree on one field for a given key value.
#
#   x           list of data frames
#   fieldname1  column used to select rows (the key)
#   value1      key value whose rows are reconciled
#   fieldname2  column that is set to its most frequent value
#
# The `fieldname2` values of all rows with `fieldname1 == value1` are pooled
# across the data frames; the most frequent one is then written back into
# those rows of every data frame. Returns the updated list. An empty list,
# or a key with no values to count, is returned unchanged.
conformity <- function(x, fieldname1, value1, fieldname2) {
  # Nothing to reconcile in an empty list.
  if (length(x) == 0L) {
    return(x)
  }
  # Pool the candidate values from every data frame and count them.
  votes <- unlist(lapply(x, extract_by_value, fieldname1, value1, fieldname2))
  counts <- table(votes)
  # No value to count for this key: leave the input untouched.
  if (length(counts) == 0L) {
    return(x)
  }
  # table() labels its cells with the values as strings, so the winning
  # label is converted back to a number; which.max() takes the first cell
  # on ties.
  most_freq <- as.numeric(names(which.max(counts)))
  # Now set all the values to the most frequent one.
  for (i in seq_along(x)) {
    x[[i]] <- sub_value(x[[i]], fieldname1, value1, fieldname2, most_freq)
  }
  x
}
# Example: for the rows whose Text is "text1", set Class to its most
# frequent value across the data frames in dflist.
conformity(dflist,"Text","text1","Class")
Jim
On Sat, May 23, 2015 at 11:23 PM, John Kane
<jrkrideau at inbox.com<mailto:jrkrideau at inbox.com>> wrote:
Hi Mohammad
Welcome to the R-help list.
There is probably a fairly easy way to do what you want, but I think we
probably need a bit more background information on what you are trying
to achieve. I know I'm not exactly clear on your decision rule(s).
It would also be very useful to see some actual sample data in a useable
format.
In particular, read up about dput() in those links and/or see ?dput.
This is the generally preferred way to supply sample or illustrative
data
to the R-help list. It basically creates a perfect copy of the data as
it
exists on 'your' machine so that R-help readers see exactly what you
do.
John Kane
Kingston ON Canada
-----Original Message-----
From: mxalimohamma at ualr.edu<mailto:mxalimohamma at ualr.edu>
Sent: Fri, 22 May 2015 12:37:50 -0500
To: r-help at r-project.org<mailto:r-help at r-project.org>
Subject: [R] Problem with comparing multiple data sets
Hi everyone,
I am very new to R and I have a task to do. I appreciate any help. I
have 3 data sets. Each data set has 4 columns. For example:
Class Comment Term Text
0 com1 aac text1
2 com2 aax text2
1 com3 vvx text3
Now I need to compare the class column between the 3 data sets and assign
the most frequent class to that text. For example, if text1 is assigned to
class 0 in data sets 1 and 2 but assigned to class 2 in data set 3, then it
should be assigned to class 0. If they are all the same, the class stays
the same. The ideal thing would be to keep the same format and just update
the class. Is there any easy way to do this?
Thanks a lot.
[[alternative HTML version deleted]]