Back to formatted view
Raw Message

Message-ID: <BC54CCC1-0447-42BC-86BA-45CC9B786D60@GMAIL.COM>
Date: 2011-09-18T04:51:36Z
From: Chuck
Subject: Indexing minute data with indexing package

I've been working with the indexing package this weekend and have been
trying to figure out how best to use it. My main goal is to load the data
into R one ticker at a time and then index it on disk using indexing /
mmap.  Then, when I need it, I'd like to load it back into memory for
analysis. I've had some success with this, but can't get it to work quite
the way I want, so I'd like to get some input here.

Here's the process I've been using:
---------
library(indexing)
library(quantmod)
library(TTR)
tkr = "SPY"

# First import the data from CSV file
headerCol = c("Date","Time","Open","High","Low","Close","Volume");
tmp = read.csv(paste(getDWD(),"\\",tkr,"_1.csv",sep=""), header =  
FALSE, stringsAsFactors = FALSE, col.names=headerCol);

tmp$Timestamp = unclass(as.POSIXct(paste(tmp$Date,tmp$Time), format =  
"%m/%d/%Y %H:%M"));

# I converted the timestamp to unix time because, in order to store it on
# disk using indexing, it must be an integer. I then use the following
# function:

indexTmp = function(tmp, symbol, db=indexenv()) {
	cwd <- getwd();
	setwd(getIWD()); 		#getIWD() returns the directory where I wish to  
store the indexed binary files
	db <- create_index(as.vector(tmp 
$Timestamp),column=paste(symbol,".DateTime",sep=""))
	db <- create_index(tmp$Open,column=paste(symbol,".Open",sep=""))
	db <- create_index(tmp$High,column=paste(symbol,".High",sep=""))
	db <- create_index(tmp$Low,column=paste(symbol,".Low",sep=""))
	db <- create_index(tmp$Close,column=paste(symbol,".Close",sep=""))
	db <- create_index(tmp$Volume,column=paste(symbol,".Volume",sep=""))
	setwd(cwd);
}

# The data from the csv file is indexed on disk as follows:

indexTmp(tmp=tmp,symbol=tkr);
db <- indexenv()
ls(db)
[1] "SPY.Close"    "SPY.DateTime"	"SPY.High"     "SPY.Low"       
"SPY.Open"     "SPY.Volume"

# Now, I'm not sure if this is the best way to do this, but I can't
# really think of another way to do it given what I understand of the
# indexing structure. Ideally, I'd like to pull the data per ticker
# using the following routine:

getIdxData = function(symbol = "", startDate = "", endDate = "",  
fields=c("Open","High","Low","Close","Volume"),db=indexenv()) {
	cwd <- getwd();
	setwd(getIWD());

	# Turn date range into unix time
	sDate = as.numeric(round(unclass(as.POSIXct(startDate))[1]));
	if (endDate == "") eDate =  
as.numeric(round(unclass(as.POSIXct(Sys.time()))[1])) else eDate =  
unclass(as.POSIXct(endDate))[1];

	# Now fetch the data in the index and construct an xts object from it
	dIdx =  
db 
[suppressWarnings 
(search_index 
(paste 
(symbol 
,".DateTime 
",sep 
= 
""),x 
= 
c 
(sDate 
,eDate),type=c(">=","<="))),eval(paste(symbol,".DateTime",sep=""))]
	dIdx=as.POSIXct(dIdx,origin="1970-01-01")

	tmp = NULL;
	cNames = NULL;
	for (field in fields) {
		tmp =  
cbind 
(tmp 
,db 
[suppressWarnings 
(search_index 
(paste 
(symbol 
,".DateTime 
",sep 
= 
""),x 
= 
c 
(sDate 
,eDate),type=c(">=","<="))),eval(paste(symbol,".",field,sep=""))]);
		cNames = c(cNames, paste(symbol,".",field,sep=""));
	}
	setwd(cwd);
	tmp = xts(tmp, order.by = dIdx);
	colnames(tmp) = cNames;
	return(tmp)
}

SPY<-getIdxData(symbol="SPY",startDate="1994-01-01")
Error in as.POSIXlt.character(x, tz, ...) :
   character string is not in a standard unambiguous format
---------

As you can see, a call to this routine fails. This is because the
'dIdx = db[suppressWarnings(search_index(...' expression evaluates to a
vector of NAs, so the as.POSIXct call doesn't work. Commenting out the
call to as.POSIXct and having the function return tmp before converting
it to an xts object gives a result that consists entirely of NAs:
---------
SPY<-getIdxData(symbol="SPY",startDate="1997-01-01")
tail(SPY)
            SPY.Open SPY.High SPY.Low SPY.Close SPY.Volume
[1893702,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893703,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893704,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893705,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893706,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
[1893707,] "NA.NA"  "NA.NA"  "NA.NA" "NA.NA"   "NA.NA"
---------

However, if you run the body of the above getIdxData function line by
line in the global environment, it is successful:
---------
symbol=tkr
startDate="1994-01-01"
endDate=""
fields=c("Open","High","Low","Close","Volume")
db=indexenv()

cwd <- getwd();
setwd(getIWD());

# Turn date range into unix time
sDate = as.numeric(round(unclass(as.POSIXct(startDate))[1]));
if (endDate == "") eDate =  
as.numeric(round(unclass(as.POSIXct(Sys.time()))[1])) else eDate =  
unclass(as.POSIXct(endDate))[1];

# Now fetch the data in the index and construct an xts object from it
dIdx =  
db 
[suppressWarnings 
(search_index 
(paste 
(symbol 
,".DateTime 
",sep 
= 
""),x 
= 
c 
(sDate 
,eDate),type=c(">=","<="))),eval(paste(symbol,".DateTime",sep=""))]
dIdx=as.POSIXct(dIdx,origin="1970-01-01")

tmp = NULL;
cNames = NULL;
for (field in fields) {
	tmp =  
cbind 
(tmp 
,db 
[suppressWarnings 
(search_index 
(paste 
(symbol 
,".DateTime 
",sep 
= 
""),x 
= 
c 
(sDate 
,eDate),type=c(">=","<="))),eval(paste(symbol,".",field,sep=""))]);
	cNames = c(cNames, paste(symbol,".",field,sep=""));
}
setwd(cwd);
tmp = xts(tmp, order.by = dIdx);
colnames(tmp) = cNames;
tail(tmp)
                     SPY.Open SPY.High SPY.Low SPY.Close SPY.Volume
2011-09-03 00:54:00   117.41   117.41  117.41    117.41        100
2011-09-03 00:55:00   117.42   117.42  117.40    117.41       2520
2011-09-03 00:56:00   117.41   117.41  117.40    117.40       2800
2011-09-03 00:57:00   117.41   117.41  117.41    117.41       1685
2011-09-03 00:58:00   117.42   117.42  117.42    117.42        500
2011-09-03 00:59:00   117.42   117.42  117.40    117.42       6418
---------

Apologies for the long post, but there was a lot of sample code to  
provide. My questions are as follows:

1) Why doesn't my getIdxData() function work properly? I'm pretty sure
it has to do with the way the indexing function evaluates its
arguments. I had to use the eval() function in order to trick the
index into accepting parameterized symbols. Perhaps there's some tricky
environment / scope issue going on here.
2) For those of you using the indexing package, what approach would you
recommend for storing equity data like this?

I understand the indexing package is very alpha, and I appreciate the  
work that has gone into it. It seems to be perfect for my purposes, as  
I would like to have many series of minute data at the ready to load  
into memory and analyze on demand.  Hopefully I can get some help here  
to resolve this. Thanks!