Message-ID: <BC54CCC1-0447-42BC-86BA-45CC9B786D60@GMAIL.COM>
Date: 2011-09-18T04:51:36Z
From: Chuck
Subject: Indexing minute data with indexing package
I've been working with indexing this weekend and have been trying to
best figure out how to use it. My main goal is to, one ticker at a
time, load the data into R, then index it on disk using indexing /
mmap. Then, when I need it, I'd like to load it into memory for
analysis. I've had some success with this, but can't get it
to work quite the way I want, so I'd like to get some input here.
Here's the process I've been using:
---------
library(indexing)
library(quantmod)
library(TTR)
# Ticker whose minute bars are being imported.
tkr <- "SPY"
# First import the data from CSV file
headerCol <- c("Date", "Time", "Open", "High", "Low", "Close", "Volume")
tmp <- read.csv(
  paste0(getDWD(), "\\", tkr, "_1.csv"),
  header = FALSE,
  stringsAsFactors = FALSE,
  col.names = headerCol
)
# Parse the combined date and time text, then drop the POSIXct class so the
# column holds the underlying numeric value.
tmp$Timestamp <- unclass(
  as.POSIXct(paste(tmp$Date, tmp$Time), format = "%m/%d/%Y %H:%M")
)
# I converted the timestamp to the unix form, because in order to
# store it on the disk using indexing it must be an integer. I then use
# the following function:
# Index one ticker's bars on disk, creating one indexed column per data
# column.
#
# Arguments:
#   tmp     Data frame providing Timestamp, Open, High, Low, Close and
#           Volume columns.
#   symbol  Ticker used as the column-name prefix ("<symbol>.Open", ...).
#   db      Kept for interface compatibility; it is overwritten by the
#           result of each create_index() call and never read.
#
# Returns (invisibly) the value of the last create_index() call.
indexTmp <- function(tmp, symbol, db = indexenv()) {
  # getIWD() returns the directory where I wish to store the indexed binary
  # files. setwd() hands back the previous directory, and on.exit() restores
  # it even when one of the create_index() calls stops with an error.
  cwd <- setwd(getIWD())
  on.exit(setwd(cwd), add = TRUE)

  # as.vector() drops the attributes left on the unclassed timestamp column.
  db <- create_index(as.vector(tmp$Timestamp),
                     column = paste0(symbol, ".DateTime"))
  db <- create_index(tmp$Open, column = paste0(symbol, ".Open"))
  db <- create_index(tmp$High, column = paste0(symbol, ".High"))
  db <- create_index(tmp$Low, column = paste0(symbol, ".Low"))
  db <- create_index(tmp$Close, column = paste0(symbol, ".Close"))
  db <- create_index(tmp$Volume, column = paste0(symbol, ".Volume"))

  invisible(db)
}
# The data from the csv file is indexed on disk as follows:
indexTmp(tmp=tmp,symbol=tkr);
db <- indexenv()
ls(db)
[1] "SPY.Close" "SPY.DateTime" "SPY.High" "SPY.Low"
"SPY.Open" "SPY.Volume"
# Now, I'm not sure if this is the best way to do this, but I can't
# really think of another way to do it given what I understand of the
# indexing structure. Ideally, I'd like to pull the data per ticker
# using the following routine:
# Load indexed minute bars for one ticker and return them as an xts object.
#
# Arguments:
#   symbol     Ticker prefix the columns were indexed under (e.g. "SPY").
#   startDate  Start of the requested range; converted with as.POSIXct().
#   endDate    End of the requested range; "" (the default) means "now".
#   fields     Column suffixes to fetch; each is read from the column named
#              <symbol>.<field>.
#   db         Index object to read from; defaults to indexenv().
#
# Returns the result of xts() on the fetched columns, ordered by the
# <symbol>.DateTime values, with columns named <symbol>.<field>.
#
# NOTE(review): this post reports that the function yields NA values when
# called, although the same statements work at top level. The db[...]
# expressions are therefore reproduced exactly as posted (apart from
# rejoining the pieces the mail client had split across lines); how the
# indexing package evaluates them inside a function still needs confirming.
getIdxData <- function(symbol = "", startDate = "", endDate = "",
                       fields = c("Open", "High", "Low", "Close", "Volume"),
                       db = indexenv()) {
  # setwd() returns the previous directory; on.exit() puts it back even if a
  # later step stops with an error.
  cwd <- setwd(getIWD())
  on.exit(setwd(cwd), add = TRUE)

  # Turn date range into unix time
  sDate <- as.numeric(round(unclass(as.POSIXct(startDate))[1]))
  if (endDate == "") {
    eDate <- as.numeric(round(unclass(as.POSIXct(Sys.time()))[1]))
  } else {
    eDate <- unclass(as.POSIXct(endDate))[1]
  }

  # Now fetch the data in the index and construct an xts object from it
  dIdx <- db[suppressWarnings(search_index(paste(symbol, ".DateTime", sep = ""),
                                           x = c(sDate, eDate),
                                           type = c(">=", "<="))),
             eval(paste(symbol, ".DateTime", sep = ""))]
  dIdx <- as.POSIXct(dIdx, origin = "1970-01-01")

  tmp <- NULL
  cNames <- NULL
  # The row search is repeated per field, as in the posted code, so that each
  # db[...] expression keeps the exact form the author used.
  for (field in fields) {
    tmp <- cbind(
      tmp,
      db[suppressWarnings(search_index(paste(symbol, ".DateTime", sep = ""),
                                       x = c(sDate, eDate),
                                       type = c(">=", "<="))),
         eval(paste(symbol, ".", field, sep = ""))]
    )
    cNames <- c(cNames, paste(symbol, ".", field, sep = ""))
  }

  tmp <- xts(tmp, order.by = dIdx)
  colnames(tmp) <- cNames
  tmp
}
SPY<-getIdxData(symbol="SPY",startDate="1994-01-01")
Error in as.POSIXlt.character(x, tz, ...) :
character string is not in a standard unambiguous format
---------
As you can see, a call to this routine results in failure. This is
because the 'dIdx = db[suppressWarnings(search_index' expression
evaluates to a vector of NAs, so the as.POSIXct function doesn't work.
Commenting out the call to as.POSIXct and having the function just
return tmp before making it an xts object results in a matrix of all
NAs:
---------
SPY<-getIdxData(symbol="SPY",startDate="1997-01-01")
tail(SPY)
SPY.Open SPY.High SPY.Low SPY.Close SPY.Volume
[1893702,] "NA.NA" "NA.NA" "NA.NA" "NA.NA" "NA.NA"
[1893703,] "NA.NA" "NA.NA" "NA.NA" "NA.NA" "NA.NA"
[1893704,] "NA.NA" "NA.NA" "NA.NA" "NA.NA" "NA.NA"
[1893705,] "NA.NA" "NA.NA" "NA.NA" "NA.NA" "NA.NA"
[1893706,] "NA.NA" "NA.NA" "NA.NA" "NA.NA" "NA.NA"
[1893707,] "NA.NA" "NA.NA" "NA.NA" "NA.NA" "NA.NA"
---------
However, if you run the body of the above getIdxData function
line by line in the global environment, it is successful:
---------
# Same statements as the body of getIdxData(), run at top level with the
# function's arguments set up as ordinary variables.
symbol <- tkr
startDate <- "1994-01-01"
endDate <- ""
fields <- c("Open", "High", "Low", "Close", "Volume")
db <- indexenv()
cwd <- getwd()
setwd(getIWD())
# Turn date range into unix time
sDate <- as.numeric(round(unclass(as.POSIXct(startDate))[1]))
if (endDate == "") {
  eDate <- as.numeric(round(unclass(as.POSIXct(Sys.time()))[1]))
} else {
  eDate <- unclass(as.POSIXct(endDate))[1]
}
# Now fetch the data in the index and construct an xts object from it
dIdx <- db[suppressWarnings(search_index(paste(symbol, ".DateTime", sep = ""),
                                         x = c(sDate, eDate),
                                         type = c(">=", "<="))),
           eval(paste(symbol, ".DateTime", sep = ""))]
dIdx <- as.POSIXct(dIdx, origin = "1970-01-01")
tmp <- NULL
cNames <- NULL
for (field in fields) {
  tmp <- cbind(
    tmp,
    db[suppressWarnings(search_index(paste(symbol, ".DateTime", sep = ""),
                                     x = c(sDate, eDate),
                                     type = c(">=", "<="))),
       eval(paste(symbol, ".", field, sep = ""))]
  )
  cNames <- c(cNames, paste(symbol, ".", field, sep = ""))
}
setwd(cwd)
tmp <- xts(tmp, order.by = dIdx)
colnames(tmp) <- cNames
tail(tmp)
SPY.Open SPY.High SPY.Low SPY.Close SPY.Volume
2011-09-03 00:54:00 117.41 117.41 117.41 117.41 100
2011-09-03 00:55:00 117.42 117.42 117.40 117.41 2520
2011-09-03 00:56:00 117.41 117.41 117.40 117.40 2800
2011-09-03 00:57:00 117.41 117.41 117.41 117.41 1685
2011-09-03 00:58:00 117.42 117.42 117.42 117.42 500
2011-09-03 00:59:00 117.42 117.42 117.40 117.42 6418
---------
Apologies for the long post, but there was a lot of sample code to
provide. My questions are as follows:
1) Why doesn't my getIdxData() function work properly? I'm pretty sure
it has to do with the way the indexing function evaluates its
arguments. I had to use the eval() function in order to trick the
index into accepting parametrized symbols. Perhaps there's some tricky
environment / scope issue going on here.
2) For those of you using the indexing package, what approach would you
recommend to use for storing equity data like this?
I understand the indexing package is very alpha, and I appreciate the
work that has gone into it. It seems to be perfect for my purposes, as
I would like to have many series of minute data at the ready to load
into memory and analyze on demand. Hopefully I can get some help here
to resolve this. Thanks!