Reading many large files causes R to crash - Possible Bug in R 2.15.1 64-bit Ubuntu

An embedded and charset-unspecified text was scrubbed...
Name: not available
URL: <https://stat.ethz.ch/pipermail/r-devel/attachments/20120722/4b5b7d0b/attachment.pl>
Cross-posted on Stack Overflow:
http://stackoverflow.com/q/11596747/271616
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com
I am reading several hundred files.  Anywhere from 50k-400k in size.  It
appears that when I read these files with R 2.15.1 the process will hang or
seg fault on the scan() call.  This does not happen on R 2.14.1.

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when performing the
scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

  tickerAbsFilenames = list.files(tickerDir,full.names=T)

  tickerNames = list.files(tickerDir,full.names=F)

  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), style = 3)

  for(i in 1:length(tickerAbsFilenames)) {

    # Grab Raw Tick Data

    dat.i = parseTickData(tickerAbsFilenames[i])

    #Sys.sleep(1)

    # Create Template

    dates <- unique(substr(as.character(index(dat.i)), 1,10))

    times <- rep("09:30:00", length(dates))

    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {

      if (is.null(templateTimes)) {

        templateTimes <- openDateTimes[j] + 0:23400

      } else {

        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)

      }

    }

    # Convert templateTimes to XTS, merge with data and convert NA's

    templateTimes <- as.xts(templateTimes)

    dat.i <- merge(dat.i, templateTimes, all=T)

    # If there is no data in the first print, we will have leading NA's.  So
set them to -1.

    # Since we do not want these values removed by to.period

    if (is.na(dat.i[1])) {

      dat.i[1] <- -1

    }

    # Fix remaining NA's

    dat.i <- na.locf(dat.i)

    # Convert to desired bucket size

    dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

    # Always use templated index, otherwise merge fails with other symbols

    index(dat.i) <- index(to.period(templateTimes, period=per, k=subper))

    # If there was missing data at open, set close to NA

    valsToChange <- which(dat.i[,"Open"] == -1)

    if (length(valsToChange) != 0) {

      dat.i[valsToChange, "Close"] <- NA

    }

    if(i == 1) {

      DAT = fun(dat.i)

    } else {

      DAT = merge(DAT,fun(dat.i))

    }

    setTxtProgressBar(pb, i)

  }

  close(pb)

  colnames(DAT) = tickerNames

  return(DAT)

}

parseTickData <- function(inputFile) {

  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

  index <- as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

  DAT.xts <- xts(DAT.list$Close,index)

  DAT.xts <- make.index.unique(DAT.xts)

  return(DAT.xts)

}

        [[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel
I am reading several hundred files.  Anywhere from 50k-400k in size.  It
appears that when I read these files with R 2.15.1 the process will hang or
seg fault on the scan() call.  This does not happen on R 2.14.1.
The code below doesn't do anything other than define a couple of 
functions.  Please simplify it to code that creates a file (or multiple 
files), reads it or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from 
these functions into the mix until you figure out what is really causing 
the bug.

If you don't post code that allows us to reproduce the crash, it's 
really unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when performing the
scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

   tickerAbsFilenames = list.files(tickerDir,full.names=T)

   tickerNames = list.files(tickerDir,full.names=F)

   tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

   pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), style = 3)

   for(i in 1:length(tickerAbsFilenames)) {

     # Grab Raw Tick Data

     dat.i = parseTickData(tickerAbsFilenames[i])

     #Sys.sleep(1)

     # Create Template

     dates <- unique(substr(as.character(index(dat.i)), 1,10))

     times <- rep("09:30:00", length(dates))

     openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

     templateTimes <- NULL

     for (j in 1:length(openDateTimes)) {

       if (is.null(templateTimes)) {

         templateTimes <- openDateTimes[j] + 0:23400

       } else {

         templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)

       }

     }

     # Convert templateTimes to XTS, merge with data and convert NA's

     templateTimes <- as.xts(templateTimes)

     dat.i <- merge(dat.i, templateTimes, all=T)

     # If there is no data in the first print, we will have leading NA's.  So
set them to -1.

     # Since we do not want these values removed by to.period

     if (is.na(dat.i[1])) {

       dat.i[1] <- -1

     }

     # Fix remaining NA's

     dat.i <- na.locf(dat.i)

     # Convert to desired bucket size

     dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

     # Always use templated index, otherwise merge fails with other symbols

     index(dat.i) <- index(to.period(templateTimes, period=per, k=subper))

     # If there was missing data at open, set close to NA

     valsToChange <- which(dat.i[,"Open"] == -1)

     if (length(valsToChange) != 0) {

       dat.i[valsToChange, "Close"] <- NA

     }

     if(i == 1) {

       DAT = fun(dat.i)

     } else {

       DAT = merge(DAT,fun(dat.i))

     }

     setTxtProgressBar(pb, i)

   }

   close(pb)

   colnames(DAT) = tickerNames

   return(DAT)

}

parseTickData <- function(inputFile) {

   DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

   index <- as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

   DAT.xts <- xts(DAT.list$Close,index)

   DAT.xts <- make.index.unique(DAT.xts)

   return(DAT.xts)

}

	[[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

I've isolated the bug.  When the seg fault was produced there was an error
that memory had not been mapped.  Here is the odd part of the bug.  If you
comment out certain code and get a full run than comment in the code which
is causing the problem it will actually run.   So I think it is safe to
assume something wrong is taking place with memory allocation.  Example.
While testing, I have been able to get to a point where the code will run.
But if I reboot the machine and try again, the code will not run.

The bug itself is happening somewhere in XTS or ZOO.  I will gladly upload
the data files.  It is happening on the 10th data file which is only 225k
lines in size.

Below is the simplified code.  The call to either

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
index(dat.i) <- index(to.period(templateTimes, period=per, k=subper))

is what is causing R to hang or crash.  I have been able to replicate this
on Windows 7 64 bit and Ubuntu 64 bit.  Seems easiest to consistently
replicate from R Studio.

The code below will consistently replicate when the appropriate files are
used.

parseTickDataFromDir = function(tickerDir, per, subper) {
  tickerAbsFilenames = list.files(tickerDir,full.names=T)
  tickerNames = list.files(tickerDir,full.names=F)
  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)
  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), style = 3)

  for(i in 1:length(tickerAbsFilenames)) {
    dat.i = parseTickData(tickerAbsFilenames[i])
    dates <- unique(substr(as.character(index(dat.i)), 1,10))
    times <- rep("09:30:00", length(dates))
    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {
      if (is.null(templateTimes)) {
        templateTimes <- openDateTimes[j] + 0:23400
      } else {
        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
      }
    }

    templateTimes <- as.xts(templateTimes)
    dat.i <- merge(dat.i, templateTimes, all=T)
    if (is.na(dat.i[1])) {
      dat.i[1] <- -1
    }
    dat.i <- na.locf(dat.i)
	dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
	index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))
    setTxtProgressBar(pb, i)
  }
  close(pb)
}

parseTickData <- function(inputFile) {
  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)
  index <- as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")
  DAT.xts <- xts(DAT.list$Close,index)
  DAT.xts <- make.index.unique(DAT.xts)
  return(DAT.xts)
}

DATTick <- parseTickDataFromDir(tickerDirSecond, "seconds",10)

-----Original Message-----
From: Duncan Murdoch [mailto:murdoch.duncan at gmail.com] 
Sent: Sunday, July 22, 2012 4:48 PM
To: David Terk
Cc: r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu
I am reading several hundred files.  Anywhere from 50k-400k in size.  
It appears that when I read these files with R 2.15.1 the process will 
hang or seg fault on the scan() call.  This does not happen on R 2.14.1.
The code below doesn't do anything other than define a couple of functions.
Please simplify it to code that creates a file (or multiple files), reads it
or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from these
functions into the mix until you figure out what is really causing the bug.

If you don't post code that allows us to reproduce the crash, it's really
unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when 
performing the scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

   tickerAbsFilenames = list.files(tickerDir,full.names=T)

   tickerNames = list.files(tickerDir,full.names=F)

   tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

   pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

   for(i in 1:length(tickerAbsFilenames)) {

     # Grab Raw Tick Data

     dat.i = parseTickData(tickerAbsFilenames[i])

     #Sys.sleep(1)

     # Create Template

     dates <- unique(substr(as.character(index(dat.i)), 1,10))

     times <- rep("09:30:00", length(dates))

     openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

     templateTimes <- NULL

     for (j in 1:length(openDateTimes)) {

       if (is.null(templateTimes)) {

         templateTimes <- openDateTimes[j] + 0:23400

       } else {

         templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)

       }

     }

     # Convert templateTimes to XTS, merge with data and convert NA's

     templateTimes <- as.xts(templateTimes)

     dat.i <- merge(dat.i, templateTimes, all=T)

     # If there is no data in the first print, we will have leading 
NA's.  So set them to -1.

     # Since we do not want these values removed by to.period

     if (is.na(dat.i[1])) {

       dat.i[1] <- -1

     }

     # Fix remaining NA's

     dat.i <- na.locf(dat.i)

     # Convert to desired bucket size

     dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

     # Always use templated index, otherwise merge fails with other 
symbols

     index(dat.i) <- index(to.period(templateTimes, period=per, 
k=subper))

     # If there was missing data at open, set close to NA

     valsToChange <- which(dat.i[,"Open"] == -1)

     if (length(valsToChange) != 0) {

       dat.i[valsToChange, "Close"] <- NA

     }

     if(i == 1) {

       DAT = fun(dat.i)

     } else {

       DAT = merge(DAT,fun(dat.i))

     }

     setTxtProgressBar(pb, i)

   }

   close(pb)

   colnames(DAT) = tickerNames

   return(DAT)

}

parseTickData <- function(inputFile) {

   DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

   index <- 
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

   DAT.xts <- xts(DAT.list$Close,index)

   DAT.xts <- make.index.unique(DAT.xts)

   return(DAT.xts)

}

	[[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

Looks like the call to:

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

If what is causing the issue.  If variable name is not set, or set to any
value other than NULL.  Than no hang occurs.  

-----Original Message-----
From: David Terk [mailto:david.terk at gmail.com] 
Sent: Monday, July 23, 2012 1:25 AM
To: 'Duncan Murdoch'
Cc: 'r-devel at r-project.org'
Subject: RE: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu

I've isolated the bug.  When the seg fault was produced there was an error
that memory had not been mapped.  Here is the odd part of the bug.  If you
comment out certain code and get a full run than comment in the code which
is causing the problem it will actually run.   So I think it is safe to
assume something wrong is taking place with memory allocation.  Example.
While testing, I have been able to get to a point where the code will run.
But if I reboot the machine and try again, the code will not run.

The bug itself is happening somewhere in XTS or ZOO.  I will gladly upload
the data files.  It is happening on the 10th data file which is only 225k
lines in size.

Below is the simplified code.  The call to either

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
index(dat.i) <- index(to.period(templateTimes, period=per, k=subper))

is what is causing R to hang or crash.  I have been able to replicate this
on Windows 7 64 bit and Ubuntu 64 bit.  Seems easiest to consistently
replicate from R Studio.

The code below will consistently replicate when the appropriate files are
used.

parseTickDataFromDir = function(tickerDir, per, subper) {
  tickerAbsFilenames = list.files(tickerDir,full.names=T)
  tickerNames = list.files(tickerDir,full.names=F)
  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)
  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), style = 3)

  for(i in 1:length(tickerAbsFilenames)) {
    dat.i = parseTickData(tickerAbsFilenames[i])
    dates <- unique(substr(as.character(index(dat.i)), 1,10))
    times <- rep("09:30:00", length(dates))
    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {
      if (is.null(templateTimes)) {
        templateTimes <- openDateTimes[j] + 0:23400
      } else {
        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
      }
    }

    templateTimes <- as.xts(templateTimes)
    dat.i <- merge(dat.i, templateTimes, all=T)
    if (is.na(dat.i[1])) {
      dat.i[1] <- -1
    }
    dat.i <- na.locf(dat.i)
	dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
	index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))
    setTxtProgressBar(pb, i)
  }
  close(pb)
}

parseTickData <- function(inputFile) {
  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)
  index <- as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")
  DAT.xts <- xts(DAT.list$Close,index)
  DAT.xts <- make.index.unique(DAT.xts)
  return(DAT.xts)
}

DATTick <- parseTickDataFromDir(tickerDirSecond, "seconds",10)

-----Original Message-----
From: Duncan Murdoch [mailto:murdoch.duncan at gmail.com]
Sent: Sunday, July 22, 2012 4:48 PM
To: David Terk
Cc: r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu
I am reading several hundred files.  Anywhere from 50k-400k in size.  
It appears that when I read these files with R 2.15.1 the process will 
hang or seg fault on the scan() call.  This does not happen on R 2.14.1.
The code below doesn't do anything other than define a couple of functions.
Please simplify it to code that creates a file (or multiple files), reads it
or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from these
functions into the mix until you figure out what is really causing the bug.

If you don't post code that allows us to reproduce the crash, it's really
unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when 
performing the scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

   tickerAbsFilenames = list.files(tickerDir,full.names=T)

   tickerNames = list.files(tickerDir,full.names=F)

   tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

   pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

   for(i in 1:length(tickerAbsFilenames)) {

     # Grab Raw Tick Data

     dat.i = parseTickData(tickerAbsFilenames[i])

     #Sys.sleep(1)

     # Create Template

     dates <- unique(substr(as.character(index(dat.i)), 1,10))

     times <- rep("09:30:00", length(dates))

     openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

     templateTimes <- NULL

     for (j in 1:length(openDateTimes)) {

       if (is.null(templateTimes)) {

         templateTimes <- openDateTimes[j] + 0:23400

       } else {

         templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)

       }

     }

     # Convert templateTimes to XTS, merge with data and convert NA's

     templateTimes <- as.xts(templateTimes)

     dat.i <- merge(dat.i, templateTimes, all=T)

     # If there is no data in the first print, we will have leading 
NA's.  So set them to -1.

     # Since we do not want these values removed by to.period

     if (is.na(dat.i[1])) {

       dat.i[1] <- -1

     }

     # Fix remaining NA's

     dat.i <- na.locf(dat.i)

     # Convert to desired bucket size

     dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

     # Always use templated index, otherwise merge fails with other 
symbols

     index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))

     # If there was missing data at open, set close to NA

     valsToChange <- which(dat.i[,"Open"] == -1)

     if (length(valsToChange) != 0) {

       dat.i[valsToChange, "Close"] <- NA

     }

     if(i == 1) {

       DAT = fun(dat.i)

     } else {

       DAT = merge(DAT,fun(dat.i))

     }

     setTxtProgressBar(pb, i)

   }

   close(pb)

   colnames(DAT) = tickerNames

   return(DAT)

}

parseTickData <- function(inputFile) {

   DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

   index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

   DAT.xts <- xts(DAT.list$Close,index)

   DAT.xts <- make.index.unique(DAT.xts)

   return(DAT.xts)

}

	[[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

David,

You still haven't provided a reproducible example.  As Duncan already
said, "if you don't post code that allows us to reproduce the crash,
it's really unlikely that we'll be able to fix it."

And R-devel is not the appropriate venue to discuss this if it's truly
an issue with xts/zoo.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com
Looks like the call to:

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

If what is causing the issue.  If variable name is not set, or set to any
value other than NULL.  Than no hang occurs.

-----Original Message-----
From: David Terk [mailto:david.terk at gmail.com]
Sent: Monday, July 23, 2012 1:25 AM
To: 'Duncan Murdoch'
Cc: 'r-devel at r-project.org'
Subject: RE: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu

I've isolated the bug.  When the seg fault was produced there was an error
that memory had not been mapped.  Here is the odd part of the bug.  If you
comment out certain code and get a full run than comment in the code which
is causing the problem it will actually run.   So I think it is safe to
assume something wrong is taking place with memory allocation.  Example.
While testing, I have been able to get to a point where the code will run.
But if I reboot the machine and try again, the code will not run.

The bug itself is happening somewhere in XTS or ZOO.  I will gladly upload
the data files.  It is happening on the 10th data file which is only 225k
lines in size.

Below is the simplified code.  The call to either

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
index(dat.i) <- index(to.period(templateTimes, period=per, k=subper))

is what is causing R to hang or crash.  I have been able to replicate this
on Windows 7 64 bit and Ubuntu 64 bit.  Seems easiest to consistently
replicate from R Studio.

The code below will consistently replicate when the appropriate files are
used.

parseTickDataFromDir = function(tickerDir, per, subper) {
  tickerAbsFilenames = list.files(tickerDir,full.names=T)
  tickerNames = list.files(tickerDir,full.names=F)
  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)
  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), style = 3)

  for(i in 1:length(tickerAbsFilenames)) {
    dat.i = parseTickData(tickerAbsFilenames[i])
    dates <- unique(substr(as.character(index(dat.i)), 1,10))
    times <- rep("09:30:00", length(dates))
    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {
      if (is.null(templateTimes)) {
        templateTimes <- openDateTimes[j] + 0:23400
      } else {
        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
      }
    }

    templateTimes <- as.xts(templateTimes)
    dat.i <- merge(dat.i, templateTimes, all=T)
    if (is.na(dat.i[1])) {
      dat.i[1] <- -1
    }
    dat.i <- na.locf(dat.i)
        dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
        index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))
    setTxtProgressBar(pb, i)
  }
  close(pb)
}

parseTickData <- function(inputFile) {
  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)
  index <- as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")
  DAT.xts <- xts(DAT.list$Close,index)
  DAT.xts <- make.index.unique(DAT.xts)
  return(DAT.xts)
}

DATTick <- parseTickDataFromDir(tickerDirSecond, "seconds",10)

-----Original Message-----
From: Duncan Murdoch [mailto:murdoch.duncan at gmail.com]
Sent: Sunday, July 22, 2012 4:48 PM
To: David Terk
Cc: r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu

On 12-07-22 3:54 PM, David Terk wrote:
I am reading several hundred files.  Anywhere from 50k-400k in size.
It appears that when I read these files with R 2.15.1 the process will
hang or seg fault on the scan() call.  This does not happen on R 2.14.1.
The code below doesn't do anything other than define a couple of functions.
Please simplify it to code that creates a file (or multiple files), reads it
or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from these
functions into the mix until you figure out what is really causing the bug.

If you don't post code that allows us to reproduce the crash, it's really
unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when
performing the scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

   tickerAbsFilenames = list.files(tickerDir,full.names=T)

   tickerNames = list.files(tickerDir,full.names=F)

   tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

   pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames),
style = 3)

   for(i in 1:length(tickerAbsFilenames)) {

     # Grab Raw Tick Data

     dat.i = parseTickData(tickerAbsFilenames[i])

     #Sys.sleep(1)

     # Create Template

     dates <- unique(substr(as.character(index(dat.i)), 1,10))

     times <- rep("09:30:00", length(dates))

     openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

     templateTimes <- NULL

     for (j in 1:length(openDateTimes)) {

       if (is.null(templateTimes)) {

         templateTimes <- openDateTimes[j] + 0:23400

       } else {

         templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)

       }

     }

     # Convert templateTimes to XTS, merge with data and convert NA's

     templateTimes <- as.xts(templateTimes)

     dat.i <- merge(dat.i, templateTimes, all=T)

     # If there is no data in the first print, we will have leading
NA's.  So set them to -1.

     # Since we do not want these values removed by to.period

     if (is.na(dat.i[1])) {

       dat.i[1] <- -1

     }

     # Fix remaining NA's

     dat.i <- na.locf(dat.i)

     # Convert to desired bucket size

     dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

     # Always use templated index, otherwise merge fails with other
symbols

     index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))

     # If there was missing data at open, set close to NA

     valsToChange <- which(dat.i[,"Open"] == -1)

     if (length(valsToChange) != 0) {

       dat.i[valsToChange, "Close"] <- NA

     }

     if(i == 1) {

       DAT = fun(dat.i)

     } else {

       DAT = merge(DAT,fun(dat.i))

     }

     setTxtProgressBar(pb, i)

   }

   close(pb)

   colnames(DAT) = tickerNames

   return(DAT)

}

parseTickData <- function(inputFile) {

   DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

   index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

   DAT.xts <- xts(DAT.list$Close,index)

   DAT.xts <- make.index.unique(DAT.xts)

   return(DAT.xts)

}

      [[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel
Where should this be discussed since it is definitely XTS related?  I will
gladly upload the simplified script + data files to whoever is maintaining
this part of the code.  Fortunately there is a workaround here.

-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com] 
Sent: Monday, July 23, 2012 8:15 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu

David,

You still haven't provided a reproducible example.  As Duncan already said,
"if you don't post code that allows us to reproduce the crash, it's really
unlikely that we'll be able to fix it."

And R-devel is not the appropriate venue to discuss this if it's truly an
issue with xts/zoo.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com
Looks like the call to:

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

If what is causing the issue.  If variable name is not set, or set to 
any value other than NULL.  Than no hang occurs.

-----Original Message-----
From: David Terk [mailto:david.terk at gmail.com]
Sent: Monday, July 23, 2012 1:25 AM
To: 'Duncan Murdoch'
Cc: 'r-devel at r-project.org'
Subject: RE: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

I've isolated the bug.  When the seg fault was produced there was an 
error that memory had not been mapped.  Here is the odd part of the 
bug.  If you comment out certain code and get a full run than comment in
the code which
is causing the problem it will actually run.   So I think it is safe to
assume something wrong is taking place with memory allocation.  Example.
While testing, I have been able to get to a point where the code will run.
But if I reboot the machine and try again, the code will not run.

The bug itself is happening somewhere in XTS or ZOO.  I will gladly 
upload the data files.  It is happening on the 10th data file which is 
only 225k lines in size.

Below is the simplified code.  The call to either

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
index(dat.i) <- index(to.period(templateTimes, period=per, k=subper))

is what is causing R to hang or crash.  I have been able to replicate 
this on Windows 7 64 bit and Ubuntu 64 bit.  Seems easiest to 
consistently replicate from R Studio.

The code below will consistently replicate when the appropriate files 
are used.

parseTickDataFromDir = function(tickerDir, per, subper) {
  tickerAbsFilenames = list.files(tickerDir,full.names=T)
  tickerNames = list.files(tickerDir,full.names=F)
  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)
  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

  for(i in 1:length(tickerAbsFilenames)) {
    dat.i = parseTickData(tickerAbsFilenames[i])
    dates <- unique(substr(as.character(index(dat.i)), 1,10))
    times <- rep("09:30:00", length(dates))
    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {
      if (is.null(templateTimes)) {
        templateTimes <- openDateTimes[j] + 0:23400
      } else {
        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
      }
    }

    templateTimes <- as.xts(templateTimes)
    dat.i <- merge(dat.i, templateTimes, all=T)
    if (is.na(dat.i[1])) {
      dat.i[1] <- -1
    }
    dat.i <- na.locf(dat.i)
        dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
        index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))
    setTxtProgressBar(pb, i)
  }
  close(pb)
}

parseTickData <- function(inputFile) {
  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)
  index <- 
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")
  DAT.xts <- xts(DAT.list$Close,index)
  DAT.xts <- make.index.unique(DAT.xts)
  return(DAT.xts)
}

DATTick <- parseTickDataFromDir(tickerDirSecond, "seconds",10)

-----Original Message-----
From: Duncan Murdoch [mailto:murdoch.duncan at gmail.com]
Sent: Sunday, July 22, 2012 4:48 PM
To: David Terk
Cc: r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

On 12-07-22 3:54 PM, David Terk wrote:
I am reading several hundred files.  Anywhere from 50k-400k in size.
It appears that when I read these files with R 2.15.1 the process 
will hang or seg fault on the scan() call.  This does not happen on R
2.14.1.
The code below doesn't do anything other than define a couple of
functions.
Please simplify it to code that creates a file (or multiple files), 
reads it or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from 
these functions into the mix until you figure out what is really causing
the bug.
If you don't post code that allows us to reproduce the crash, it's 
really unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when 
performing the scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

   tickerAbsFilenames = list.files(tickerDir,full.names=T)

   tickerNames = list.files(tickerDir,full.names=F)

   tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

   pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

   for(i in 1:length(tickerAbsFilenames)) {

     # Grab Raw Tick Data

     dat.i = parseTickData(tickerAbsFilenames[i])

     #Sys.sleep(1)

     # Create Template

     dates <- unique(substr(as.character(index(dat.i)), 1,10))

     times <- rep("09:30:00", length(dates))

     openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

     templateTimes <- NULL

     for (j in 1:length(openDateTimes)) {

       if (is.null(templateTimes)) {

         templateTimes <- openDateTimes[j] + 0:23400

       } else {

         templateTimes <- c(templateTimes, openDateTimes[j] + 
0:23400)

       }

     }

     # Convert templateTimes to XTS, merge with data and convert NA's

     templateTimes <- as.xts(templateTimes)

     dat.i <- merge(dat.i, templateTimes, all=T)

     # If there is no data in the first print, we will have leading 
NA's.  So set them to -1.

     # Since we do not want these values removed by to.period

     if (is.na(dat.i[1])) {

       dat.i[1] <- -1

     }

     # Fix remaining NA's

     dat.i <- na.locf(dat.i)

     # Convert to desired bucket size

     dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

     # Always use templated index, otherwise merge fails with other 
symbols

     index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))

     # If there was missing data at open, set close to NA

     valsToChange <- which(dat.i[,"Open"] == -1)

     if (length(valsToChange) != 0) {

       dat.i[valsToChange, "Close"] <- NA

     }

     if(i == 1) {

       DAT = fun(dat.i)

     } else {

       DAT = merge(DAT,fun(dat.i))

     }

     setTxtProgressBar(pb, i)

   }

   close(pb)

   colnames(DAT) = tickerNames

   return(DAT)

}

parseTickData <- function(inputFile) {

   DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

   index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

   DAT.xts <- xts(DAT.list$Close,index)

   DAT.xts <- make.index.unique(DAT.xts)

   return(DAT.xts)

}

      [[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel
Well, you still haven't convinced anyone but yourself that it's
definitely an xts problem, since you have not provided any
reproducible example...
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com
Where should this be discussed since it is definitely XTS related?  I will
gladly upload the simplified script + data files to whoever is maintaining
this part of the code.  Fortunately there is a workaround here.

-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com]
Sent: Monday, July 23, 2012 8:15 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu

David,

You still haven't provided a reproducible example.  As Duncan already said,
"if you don't post code that allows us to reproduce the crash, it's really
unlikely that we'll be able to fix it."

And R-devel is not the appropriate venue to discuss this if it's truly an
issue with xts/zoo.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com

On Mon, Jul 23, 2012 at 12:41 AM, David Terk <david.terk at gmail.com> wrote:
Looks like the call to:

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

If what is causing the issue.  If variable name is not set, or set to
any value other than NULL.  Than no hang occurs.

-----Original Message-----
From: David Terk [mailto:david.terk at gmail.com]
Sent: Monday, July 23, 2012 1:25 AM
To: 'Duncan Murdoch'
Cc: 'r-devel at r-project.org'
Subject: RE: [Rd] Reading many large files causes R to crash -
Possible Bug in R 2.15.1 64-bit Ubuntu

I've isolated the bug.  When the seg fault was produced there was an
error that memory had not been mapped.  Here is the odd part of the
bug.  If you comment out certain code and get a full run than comment in
the code which
is causing the problem it will actually run.   So I think it is safe to
assume something wrong is taking place with memory allocation.  Example.
While testing, I have been able to get to a point where the code will run.
But if I reboot the machine and try again, the code will not run.

The bug itself is happening somewhere in XTS or ZOO.  I will gladly
upload the data files.  It is happening on the 10th data file which is
only 225k lines in size.

Below is the simplified code.  The call to either

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
index(dat.i) <- index(to.period(templateTimes, period=per, k=subper))

is what is causing R to hang or crash.  I have been able to replicate
this on Windows 7 64 bit and Ubuntu 64 bit.  Seems easiest to
consistently replicate from R Studio.

The code below will consistently replicate when the appropriate files
are used.

parseTickDataFromDir = function(tickerDir, per, subper) {
  tickerAbsFilenames = list.files(tickerDir,full.names=T)
  tickerNames = list.files(tickerDir,full.names=F)
  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)
  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames),
style = 3)

  for(i in 1:length(tickerAbsFilenames)) {
    dat.i = parseTickData(tickerAbsFilenames[i])
    dates <- unique(substr(as.character(index(dat.i)), 1,10))
    times <- rep("09:30:00", length(dates))
    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {
      if (is.null(templateTimes)) {
        templateTimes <- openDateTimes[j] + 0:23400
      } else {
        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
      }
    }

    templateTimes <- as.xts(templateTimes)
    dat.i <- merge(dat.i, templateTimes, all=T)
    if (is.na(dat.i[1])) {
      dat.i[1] <- -1
    }
    dat.i <- na.locf(dat.i)
        dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
        index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))
    setTxtProgressBar(pb, i)
  }
  close(pb)
}

parseTickData <- function(inputFile) {
  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)
  index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")
  DAT.xts <- xts(DAT.list$Close,index)
  DAT.xts <- make.index.unique(DAT.xts)
  return(DAT.xts)
}

DATTick <- parseTickDataFromDir(tickerDirSecond, "seconds",10)

-----Original Message-----
From: Duncan Murdoch [mailto:murdoch.duncan at gmail.com]
Sent: Sunday, July 22, 2012 4:48 PM
To: David Terk
Cc: r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash -
Possible Bug in R 2.15.1 64-bit Ubuntu

On 12-07-22 3:54 PM, David Terk wrote:
I am reading several hundred files.  Anywhere from 50k-400k in size.
It appears that when I read these files with R 2.15.1 the process
will hang or seg fault on the scan() call.  This does not happen on R
2.14.1.
The code below doesn't do anything other than define a couple of
functions.
Please simplify it to code that creates a file (or multiple files),
reads it or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from
these functions into the mix until you figure out what is really causing
the bug.
If you don't post code that allows us to reproduce the crash, it's
really unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when
performing the scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

   tickerAbsFilenames = list.files(tickerDir,full.names=T)

   tickerNames = list.files(tickerDir,full.names=F)

   tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

   pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames),
style = 3)

   for(i in 1:length(tickerAbsFilenames)) {

     # Grab Raw Tick Data

     dat.i = parseTickData(tickerAbsFilenames[i])

     #Sys.sleep(1)

     # Create Template

     dates <- unique(substr(as.character(index(dat.i)), 1,10))

     times <- rep("09:30:00", length(dates))

     openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

     templateTimes <- NULL

     for (j in 1:length(openDateTimes)) {

       if (is.null(templateTimes)) {

         templateTimes <- openDateTimes[j] + 0:23400

       } else {

         templateTimes <- c(templateTimes, openDateTimes[j] +
0:23400)

       }

     }

     # Convert templateTimes to XTS, merge with data and convert NA's

     templateTimes <- as.xts(templateTimes)

     dat.i <- merge(dat.i, templateTimes, all=T)

     # If there is no data in the first print, we will have leading
NA's.  So set them to -1.

     # Since we do not want these values removed by to.period

     if (is.na(dat.i[1])) {

       dat.i[1] <- -1

     }

     # Fix remaining NA's

     dat.i <- na.locf(dat.i)

     # Convert to desired bucket size

     dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

     # Always use templated index, otherwise merge fails with other
symbols

     index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))

     # If there was missing data at open, set close to NA

     valsToChange <- which(dat.i[,"Open"] == -1)

     if (length(valsToChange) != 0) {

       dat.i[valsToChange, "Close"] <- NA

     }

     if(i == 1) {

       DAT = fun(dat.i)

     } else {

       DAT = merge(DAT,fun(dat.i))

     }

     setTxtProgressBar(pb, i)

   }

   close(pb)

   colnames(DAT) = tickerNames

   return(DAT)

}

parseTickData <- function(inputFile) {

   DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

   index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

   DAT.xts <- xts(DAT.list$Close,index)

   DAT.xts <- make.index.unique(DAT.xts)

   return(DAT.xts)

}

      [[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

I'm attaching a runnable script and corresponding data files.  This will
freeze at 83%.

I'm not sure how much simpler to get than this. 

-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com] 
Sent: Monday, July 23, 2012 9:17 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu

Well, you still haven't convinced anyone but yourself that it's definitely
an xts problem, since you have not provided any reproducible example...
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com
Where should this be discussed since it is definitely XTS related?  I 
will gladly upload the simplified script + data files to whoever is 
maintaining this part of the code.  Fortunately there is a workaround
here.
-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com]
Sent: Monday, July 23, 2012 8:15 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

David,

You still haven't provided a reproducible example.  As Duncan already 
said, "if you don't post code that allows us to reproduce the crash, 
it's really unlikely that we'll be able to fix it."

And R-devel is not the appropriate venue to discuss this if it's truly 
an issue with xts/zoo.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich FOSS Trading  |  
www.fosstrading.com

On Mon, Jul 23, 2012 at 12:41 AM, David Terk <david.terk at gmail.com> wrote:
Looks like the call to:

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

If what is causing the issue.  If variable name is not set, or set to 
any value other than NULL.  Than no hang occurs.

-----Original Message-----
From: David Terk [mailto:david.terk at gmail.com]
Sent: Monday, July 23, 2012 1:25 AM
To: 'Duncan Murdoch'
Cc: 'r-devel at r-project.org'
Subject: RE: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

I've isolated the bug.  When the seg fault was produced there was an 
error that memory had not been mapped.  Here is the odd part of the 
bug.  If you comment out certain code and get a full run than comment 
in
the code which
is causing the problem it will actually run.   So I think it is safe to
assume something wrong is taking place with memory allocation.  Example.
While testing, I have been able to get to a point where the code will
run.
But if I reboot the machine and try again, the code will not run.

The bug itself is happening somewhere in XTS or ZOO.  I will gladly 
upload the data files.  It is happening on the 10th data file which 
is only 225k lines in size.

Below is the simplified code.  The call to either

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
index(dat.i) <- index(to.period(templateTimes, period=per, k=subper))

is what is causing R to hang or crash.  I have been able to replicate 
this on Windows 7 64 bit and Ubuntu 64 bit.  Seems easiest to 
consistently replicate from R Studio.

The code below will consistently replicate when the appropriate files 
are used.

parseTickDataFromDir = function(tickerDir, per, subper) {
  tickerAbsFilenames = list.files(tickerDir,full.names=T)
  tickerNames = list.files(tickerDir,full.names=F)
  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)
  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

  for(i in 1:length(tickerAbsFilenames)) {
    dat.i = parseTickData(tickerAbsFilenames[i])
    dates <- unique(substr(as.character(index(dat.i)), 1,10))
    times <- rep("09:30:00", length(dates))
    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {
      if (is.null(templateTimes)) {
        templateTimes <- openDateTimes[j] + 0:23400
      } else {
        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
      }
    }

    templateTimes <- as.xts(templateTimes)
    dat.i <- merge(dat.i, templateTimes, all=T)
    if (is.na(dat.i[1])) {
      dat.i[1] <- -1
    }
    dat.i <- na.locf(dat.i)
        dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
        index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))
    setTxtProgressBar(pb, i)
  }
  close(pb)
}

parseTickData <- function(inputFile) {
  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)
  index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")
  DAT.xts <- xts(DAT.list$Close,index)
  DAT.xts <- make.index.unique(DAT.xts)
  return(DAT.xts)
}

DATTick <- parseTickDataFromDir(tickerDirSecond, "seconds",10)

-----Original Message-----
From: Duncan Murdoch [mailto:murdoch.duncan at gmail.com]
Sent: Sunday, July 22, 2012 4:48 PM
To: David Terk
Cc: r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

On 12-07-22 3:54 PM, David Terk wrote:
I am reading several hundred files.  Anywhere from 50k-400k in size.
It appears that when I read these files with R 2.15.1 the process 
will hang or seg fault on the scan() call.  This does not happen on 
R
2.14.1.
The code below doesn't do anything other than define a couple of
functions.
Please simplify it to code that creates a file (or multiple files), 
reads it or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from 
these functions into the mix until you figure out what is really 
causing
the bug.
If you don't post code that allows us to reproduce the crash, it's 
really unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when 
performing the scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

   tickerAbsFilenames = list.files(tickerDir,full.names=T)

   tickerNames = list.files(tickerDir,full.names=F)

   tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

   pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

   for(i in 1:length(tickerAbsFilenames)) {

     # Grab Raw Tick Data

     dat.i = parseTickData(tickerAbsFilenames[i])

     #Sys.sleep(1)

     # Create Template

     dates <- unique(substr(as.character(index(dat.i)), 1,10))

     times <- rep("09:30:00", length(dates))

     openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

     templateTimes <- NULL

     for (j in 1:length(openDateTimes)) {

       if (is.null(templateTimes)) {

         templateTimes <- openDateTimes[j] + 0:23400

       } else {

         templateTimes <- c(templateTimes, openDateTimes[j] +
0:23400)

       }

     }

     # Convert templateTimes to XTS, merge with data and convert 
NA's

     templateTimes <- as.xts(templateTimes)

     dat.i <- merge(dat.i, templateTimes, all=T)

     # If there is no data in the first print, we will have leading 
NA's.  So set them to -1.

     # Since we do not want these values removed by to.period

     if (is.na(dat.i[1])) {

       dat.i[1] <- -1

     }

     # Fix remaining NA's

     dat.i <- na.locf(dat.i)

     # Convert to desired bucket size

     dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

     # Always use templated index, otherwise merge fails with other 
symbols

     index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))

     # If there was missing data at open, set close to NA

     valsToChange <- which(dat.i[,"Open"] == -1)

     if (length(valsToChange) != 0) {

       dat.i[valsToChange, "Close"] <- NA

     }

     if(i == 1) {

       DAT = fun(dat.i)

     } else {

       DAT = merge(DAT,fun(dat.i))

     }

     setTxtProgressBar(pb, i)

   }

   close(pb)

   colnames(DAT) = tickerNames

   return(DAT)

}

parseTickData <- function(inputFile) {

   DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

   index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

   DAT.xts <- xts(DAT.list$Close,index)

   DAT.xts <- make.index.unique(DAT.xts)

   return(DAT.xts)

}

      [[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

David,

Thank you for providing something reproducible.

This line:
templateTimes <- as.xts(templateTimes)

creates a zero-width xts object (i.e. the coredata is a zero-length
vector, but there is a non-zero-length index). So, the
to.period(templateTimes) call returns OHLC data of random memory
locations.  This is the likely cause of the segfaults.

Since aggregating "no data" doesn't make sense, I have patched
to.period to throw an error when run on zero-width/length objects
(revision 690 on R-Forge).  The attached file works with the CRAN
version of xts because it avoids the issue entirely.

Your script will still "hang" on the BAC_0.csv file because
as.character.POSIXt can take a long time.  Better to just call
format() directly (as I do in the attached file).

If you have any follow-up questions, please send them to R-SIG-Finance.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com
I'm attaching a runnable script and corresponding data files.  This will
freeze at 83%.

I'm not sure how much simpler to get than this.

-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com]
Sent: Monday, July 23, 2012 9:17 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu

Well, you still haven't convinced anyone but yourself that it's definitely
an xts problem, since you have not provided any reproducible example...
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com

On Mon, Jul 23, 2012 at 8:14 AM, David Terk <david.terk at gmail.com> wrote:
Where should this be discussed since it is definitely XTS related?  I
will gladly upload the simplified script + data files to whoever is
maintaining this part of the code.  Fortunately there is a workaround
here.
-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com]
Sent: Monday, July 23, 2012 8:15 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash -
Possible Bug in R 2.15.1 64-bit Ubuntu

David,

You still haven't provided a reproducible example.  As Duncan already
said, "if you don't post code that allows us to reproduce the crash,
it's really unlikely that we'll be able to fix it."

And R-devel is not the appropriate venue to discuss this if it's truly
an issue with xts/zoo.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich FOSS Trading  |
www.fosstrading.com

On Mon, Jul 23, 2012 at 12:41 AM, David Terk <david.terk at gmail.com> wrote:
Looks like the call to:

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

If what is causing the issue.  If variable name is not set, or set to
any value other than NULL.  Than no hang occurs.

-----Original Message-----
From: David Terk [mailto:david.terk at gmail.com]
Sent: Monday, July 23, 2012 1:25 AM
To: 'Duncan Murdoch'
Cc: 'r-devel at r-project.org'
Subject: RE: [Rd] Reading many large files causes R to crash -
Possible Bug in R 2.15.1 64-bit Ubuntu

I've isolated the bug.  When the seg fault was produced there was an
error that memory had not been mapped.  Here is the odd part of the
bug.  If you comment out certain code and get a full run than comment
in
the code which
is causing the problem it will actually run.   So I think it is safe to
assume something wrong is taking place with memory allocation.  Example.
While testing, I have been able to get to a point where the code will
run.
But if I reboot the machine and try again, the code will not run.

The bug itself is happening somewhere in XTS or ZOO.  I will gladly
upload the data files.  It is happening on the 10th data file which
is only 225k lines in size.

Below is the simplified code.  The call to either

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
index(dat.i) <- index(to.period(templateTimes, period=per, k=subper))

is what is causing R to hang or crash.  I have been able to replicate
this on Windows 7 64 bit and Ubuntu 64 bit.  Seems easiest to
consistently replicate from R Studio.

The code below will consistently replicate when the appropriate files
are used.

parseTickDataFromDir = function(tickerDir, per, subper) {
  tickerAbsFilenames = list.files(tickerDir,full.names=T)
  tickerNames = list.files(tickerDir,full.names=F)
  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)
  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames),
style = 3)

  for(i in 1:length(tickerAbsFilenames)) {
    dat.i = parseTickData(tickerAbsFilenames[i])
    dates <- unique(substr(as.character(index(dat.i)), 1,10))
    times <- rep("09:30:00", length(dates))
    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {
      if (is.null(templateTimes)) {
        templateTimes <- openDateTimes[j] + 0:23400
      } else {
        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
      }
    }

    templateTimes <- as.xts(templateTimes)
    dat.i <- merge(dat.i, templateTimes, all=T)
    if (is.na(dat.i[1])) {
      dat.i[1] <- -1
    }
    dat.i <- na.locf(dat.i)
        dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
        index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))
    setTxtProgressBar(pb, i)
  }
  close(pb)
}

parseTickData <- function(inputFile) {
  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)
  index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")
  DAT.xts <- xts(DAT.list$Close,index)
  DAT.xts <- make.index.unique(DAT.xts)
  return(DAT.xts)
}

DATTick <- parseTickDataFromDir(tickerDirSecond, "seconds",10)

-----Original Message-----
From: Duncan Murdoch [mailto:murdoch.duncan at gmail.com]
Sent: Sunday, July 22, 2012 4:48 PM
To: David Terk
Cc: r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash -
Possible Bug in R 2.15.1 64-bit Ubuntu

On 12-07-22 3:54 PM, David Terk wrote:
I am reading several hundred files.  Anywhere from 50k-400k in size.
It appears that when I read these files with R 2.15.1 the process
will hang or seg fault on the scan() call.  This does not happen on
R
2.14.1.
The code below doesn't do anything other than define a couple of
functions.
Please simplify it to code that creates a file (or multiple files),
reads it or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from
these functions into the mix until you figure out what is really
causing
the bug.
If you don't post code that allows us to reproduce the crash, it's
really unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when
performing the scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

   tickerAbsFilenames = list.files(tickerDir,full.names=T)

   tickerNames = list.files(tickerDir,full.names=F)

   tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

   pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames),
style = 3)

   for(i in 1:length(tickerAbsFilenames)) {

     # Grab Raw Tick Data

     dat.i = parseTickData(tickerAbsFilenames[i])

     #Sys.sleep(1)

     # Create Template

     dates <- unique(substr(as.character(index(dat.i)), 1,10))

     times <- rep("09:30:00", length(dates))

     openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

     templateTimes <- NULL

     for (j in 1:length(openDateTimes)) {

       if (is.null(templateTimes)) {

         templateTimes <- openDateTimes[j] + 0:23400

       } else {

         templateTimes <- c(templateTimes, openDateTimes[j] +
0:23400)

       }

     }

     # Convert templateTimes to XTS, merge with data and convert
NA's

     templateTimes <- as.xts(templateTimes)

     dat.i <- merge(dat.i, templateTimes, all=T)

     # If there is no data in the first print, we will have leading
NA's.  So set them to -1.

     # Since we do not want these values removed by to.period

     if (is.na(dat.i[1])) {

       dat.i[1] <- -1

     }

     # Fix remaining NA's

     dat.i <- na.locf(dat.i)

     # Convert to desired bucket size

     dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

     # Always use templated index, otherwise merge fails with other
symbols

     index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))

     # If there was missing data at open, set close to NA

     valsToChange <- which(dat.i[,"Open"] == -1)

     if (length(valsToChange) != 0) {

       dat.i[valsToChange, "Close"] <- NA

     }

     if(i == 1) {

       DAT = fun(dat.i)

     } else {

       DAT = merge(DAT,fun(dat.i))

     }

     setTxtProgressBar(pb, i)

   }

   close(pb)

   colnames(DAT) = tickerNames

   return(DAT)

}

parseTickData <- function(inputFile) {

   DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

   index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

   DAT.xts <- xts(DAT.list$Close,index)

   DAT.xts <- make.index.unique(DAT.xts)

   return(DAT.xts)

}

      [[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

Thank you for getting this done so quickly.  This will process now.

One quick question regarding a call to as.character.POSIXt.  When using
scan, since scan reads line by line, would it make sense to have the ability
to perform a char -> POSIXct conversion on each line that is read, rather
than after all lines have been read?  Perhaps this already exists somewhere
and I am not aware of it.

-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com] 
Sent: Monday, July 23, 2012 12:00 PM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu

David,

Thank you for providing something reproducible.

This line:
templateTimes <- as.xts(templateTimes)

creates a zero-width xts object (i.e. the coredata is a zero-length vector,
but there is a non-zero-length index). So, the
to.period(templateTimes) call returns OHLC data of random memory locations.
This is the likely cause of the segfaults.

Since aggregating "no data" doesn't make sense, I have patched to.period to
throw an error when run on zero-width/length objects (revision 690 on
R-Forge).  The attached file works with the CRAN version of xts because it
avoids the issue entirely.

Your script will still "hang" on the BAC_0.csv file because
as.character.POSIXt can take a long time.  Better to just call
format() directly (as I do in the attached file).

If you have any follow-up questions, please send them to R-SIG-Finance.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com
I'm attaching a runnable script and corresponding data files.  This 
will freeze at 83%.

I'm not sure how much simpler to get than this.

-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com]
Sent: Monday, July 23, 2012 9:17 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

Well, you still haven't convinced anyone but yourself that it's 
definitely an xts problem, since you have not provided any reproducible
example...
--
Joshua Ulrich  |  about.me/joshuaulrich FOSS Trading  |  
www.fosstrading.com

On Mon, Jul 23, 2012 at 8:14 AM, David Terk <david.terk at gmail.com> wrote:
Where should this be discussed since it is definitely XTS related?  I 
will gladly upload the simplified script + data files to whoever is 
maintaining this part of the code.  Fortunately there is a workaround
here.
-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com]
Sent: Monday, July 23, 2012 8:15 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

David,

You still haven't provided a reproducible example.  As Duncan already 
said, "if you don't post code that allows us to reproduce the crash, 
it's really unlikely that we'll be able to fix it."

And R-devel is not the appropriate venue to discuss this if it's 
truly an issue with xts/zoo.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich FOSS Trading  | 
www.fosstrading.com

On Mon, Jul 23, 2012 at 12:41 AM, David Terk <david.terk at gmail.com>
wrote:
Looks like the call to:

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

If what is causing the issue.  If variable name is not set, or set 
to any value other than NULL.  Than no hang occurs.

-----Original Message-----
From: David Terk [mailto:david.terk at gmail.com]
Sent: Monday, July 23, 2012 1:25 AM
To: 'Duncan Murdoch'
Cc: 'r-devel at r-project.org'
Subject: RE: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

I've isolated the bug.  When the seg fault was produced there was an 
error that memory had not been mapped.  Here is the odd part of the 
bug.  If you comment out certain code and get a full run than 
comment in
the code which
is causing the problem it will actually run.   So I think it is safe to
assume something wrong is taking place with memory allocation.  Example.
While testing, I have been able to get to a point where the code 
will
run.
But if I reboot the machine and try again, the code will not run.

The bug itself is happening somewhere in XTS or ZOO.  I will gladly 
upload the data files.  It is happening on the 10th data file which 
is only 225k lines in size.

Below is the simplified code.  The call to either

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
index(dat.i) <- index(to.period(templateTimes, period=per, 
k=subper))

is what is causing R to hang or crash.  I have been able to 
replicate this on Windows 7 64 bit and Ubuntu 64 bit.  Seems easiest 
to consistently replicate from R Studio.

The code below will consistently replicate when the appropriate 
files are used.

parseTickDataFromDir = function(tickerDir, per, subper) {
  tickerAbsFilenames = list.files(tickerDir,full.names=T)
  tickerNames = list.files(tickerDir,full.names=F)
  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)
  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

  for(i in 1:length(tickerAbsFilenames)) {
    dat.i = parseTickData(tickerAbsFilenames[i])
    dates <- unique(substr(as.character(index(dat.i)), 1,10))
    times <- rep("09:30:00", length(dates))
    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {
      if (is.null(templateTimes)) {
        templateTimes <- openDateTimes[j] + 0:23400
      } else {
        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
      }
    }

    templateTimes <- as.xts(templateTimes)
    dat.i <- merge(dat.i, templateTimes, all=T)
    if (is.na(dat.i[1])) {
      dat.i[1] <- -1
    }
    dat.i <- na.locf(dat.i)
        dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
        index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))
    setTxtProgressBar(pb, i)
  }
  close(pb)
}

parseTickData <- function(inputFile) {
  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)
  index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")
  DAT.xts <- xts(DAT.list$Close,index)
  DAT.xts <- make.index.unique(DAT.xts)
  return(DAT.xts)
}

DATTick <- parseTickDataFromDir(tickerDirSecond, "seconds",10)

-----Original Message-----
From: Duncan Murdoch [mailto:murdoch.duncan at gmail.com]
Sent: Sunday, July 22, 2012 4:48 PM
To: David Terk
Cc: r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

On 12-07-22 3:54 PM, David Terk wrote:
I am reading several hundred files.  Anywhere from 50k-400k in size.
It appears that when I read these files with R 2.15.1 the process 
will hang or seg fault on the scan() call.  This does not happen on 
R
2.14.1.
The code below doesn't do anything other than define a couple of
functions.
Please simplify it to code that creates a file (or multiple files), 
reads it or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from 
these functions into the mix until you figure out what is really 
causing
the bug.
If you don't post code that allows us to reproduce the crash, it's 
really unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when 
performing the scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

   tickerAbsFilenames = list.files(tickerDir,full.names=T)

   tickerNames = list.files(tickerDir,full.names=F)

   tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

   pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

   for(i in 1:length(tickerAbsFilenames)) {

     # Grab Raw Tick Data

     dat.i = parseTickData(tickerAbsFilenames[i])

     #Sys.sleep(1)

     # Create Template

     dates <- unique(substr(as.character(index(dat.i)), 1,10))

     times <- rep("09:30:00", length(dates))

     openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

     templateTimes <- NULL

     for (j in 1:length(openDateTimes)) {

       if (is.null(templateTimes)) {

         templateTimes <- openDateTimes[j] + 0:23400

       } else {

         templateTimes <- c(templateTimes, openDateTimes[j] +
0:23400)

       }

     }

     # Convert templateTimes to XTS, merge with data and convert 
NA's

     templateTimes <- as.xts(templateTimes)

     dat.i <- merge(dat.i, templateTimes, all=T)

     # If there is no data in the first print, we will have leading 
NA's.  So set them to -1.

     # Since we do not want these values removed by to.period

     if (is.na(dat.i[1])) {

       dat.i[1] <- -1

     }

     # Fix remaining NA's

     dat.i <- na.locf(dat.i)

     # Convert to desired bucket size

     dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

     # Always use templated index, otherwise merge fails with other 
symbols

     index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))

     # If there was missing data at open, set close to NA

     valsToChange <- which(dat.i[,"Open"] == -1)

     if (length(valsToChange) != 0) {

       dat.i[valsToChange, "Close"] <- NA

     }

     if(i == 1) {

       DAT = fun(dat.i)

     } else {

       DAT = merge(DAT,fun(dat.i))

     }

     setTxtProgressBar(pb, i)

   }

   close(pb)

   colnames(DAT) = tickerNames

   return(DAT)

}

parseTickData <- function(inputFile) {

   DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

   index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

   DAT.xts <- xts(DAT.list$Close,index)

   DAT.xts <- make.index.unique(DAT.xts)

   return(DAT.xts)

}

      [[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

One quick question regarding a call to as.character.POSIXt.  When using
scan, since scan reads line by line, would it make sense to have the ability
to perform a char -> POSIXct conversion on each line that is read, rather
than after all lines have been read?  Perhaps this already exists somewhere
and I am not aware of it.
It's actually much faster to load everything into memory and then 
convert it all to xts at once. as.POSIXct will work on a vector to 
create your index, this s better than calling it millions of times, once 
for each row.
Brian

Thank you for getting this done so quickly.  This will process now.

One quick question regarding a call to as.character.POSIXt.  When using
scan, since scan reads line by line, would it make sense to have the ability
to perform a char -> POSIXct conversion on each line that is read, rather
than after all lines have been read?
That's not the problem -- the problem is that converting through format specifications is very, very slow - if you have standard yyyy-mm-dd hh:mm:ss format (or a subset thereof) you can use fastPOSTXct from http://rforge.net/fasttime  - it's many orders of magnitude faster than using format-based conversions - but it is also limited to the standard GMT format (hence the speed). If you have more complex format and have to go through format, you can use pvec from multicore/parallel to at least use all cores of your machine.

Cheers,
Simon
 Perhaps this already exists somewhere
and I am not aware of it.

-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com] 
Sent: Monday, July 23, 2012 12:00 PM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - Possible Bug
in R 2.15.1 64-bit Ubuntu

David,

Thank you for providing something reproducible.

This line:
templateTimes <- as.xts(templateTimes)

creates a zero-width xts object (i.e. the coredata is a zero-length vector,
but there is a non-zero-length index). So, the
to.period(templateTimes) call returns OHLC data of random memory locations.
This is the likely cause of the segfaults.

Since aggregating "no data" doesn't make sense, I have patched to.period to
throw an error when run on zero-width/length objects (revision 690 on
R-Forge).  The attached file works with the CRAN version of xts because it
avoids the issue entirely.

Your script will still "hang" on the BAC_0.csv file because
as.character.POSIXt can take a long time.  Better to just call
format() directly (as I do in the attached file).

If you have any follow-up questions, please send them to R-SIG-Finance.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading  |  www.fosstrading.com

On Mon, Jul 23, 2012 at 8:41 AM, David Terk <david.terk at gmail.com> wrote:
I'm attaching a runnable script and corresponding data files.  This 
will freeze at 83%.

I'm not sure how much simpler to get than this.

-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com]
Sent: Monday, July 23, 2012 9:17 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

Well, you still haven't convinced anyone but yourself that it's 
definitely an xts problem, since you have not provided any reproducible
example...
--
Joshua Ulrich  |  about.me/joshuaulrich FOSS Trading  |  
www.fosstrading.com

On Mon, Jul 23, 2012 at 8:14 AM, David Terk <david.terk at gmail.com> wrote:
Where should this be discussed since it is definitely XTS related?  I 
will gladly upload the simplified script + data files to whoever is 
maintaining this part of the code.  Fortunately there is a workaround
here.
-----Original Message-----
From: Joshua Ulrich [mailto:josh.m.ulrich at gmail.com]
Sent: Monday, July 23, 2012 8:15 AM
To: David Terk
Cc: Duncan Murdoch; r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

David,

You still haven't provided a reproducible example.  As Duncan already 
said, "if you don't post code that allows us to reproduce the crash, 
it's really unlikely that we'll be able to fix it."

And R-devel is not the appropriate venue to discuss this if it's 
truly an issue with xts/zoo.

Best,
--
Joshua Ulrich  |  about.me/joshuaulrich FOSS Trading  | 
www.fosstrading.com

On Mon, Jul 23, 2012 at 12:41 AM, David Terk <david.terk at gmail.com>
wrote:
Looks like the call to:

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

If what is causing the issue.  If variable name is not set, or set 
to any value other than NULL.  Than no hang occurs.

-----Original Message-----
From: David Terk [mailto:david.terk at gmail.com]
Sent: Monday, July 23, 2012 1:25 AM
To: 'Duncan Murdoch'
Cc: 'r-devel at r-project.org'
Subject: RE: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

I've isolated the bug.  When the seg fault was produced there was an 
error that memory had not been mapped.  Here is the odd part of the 
bug.  If you comment out certain code and get a full run than 
comment in
the code which
is causing the problem it will actually run.   So I think it is safe to
assume something wrong is taking place with memory allocation.  Example.
While testing, I have been able to get to a point where the code 
will
run.
But if I reboot the machine and try again, the code will not run.

The bug itself is happening somewhere in XTS or ZOO.  I will gladly 
upload the data files.  It is happening on the 10th data file which 
is only 225k lines in size.

Below is the simplified code.  The call to either

dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
index(dat.i) <- index(to.period(templateTimes, period=per, 
k=subper))

is what is causing R to hang or crash.  I have been able to 
replicate this on Windows 7 64 bit and Ubuntu 64 bit.  Seems easiest 
to consistently replicate from R Studio.

The code below will consistently replicate when the appropriate 
files are used.

parseTickDataFromDir = function(tickerDir, per, subper) {
 tickerAbsFilenames = list.files(tickerDir,full.names=T)
 tickerNames = list.files(tickerDir,full.names=F)
 tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)
 pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

 for(i in 1:length(tickerAbsFilenames)) {
   dat.i = parseTickData(tickerAbsFilenames[i])
   dates <- unique(substr(as.character(index(dat.i)), 1,10))
   times <- rep("09:30:00", length(dates))
   openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
   templateTimes <- NULL

   for (j in 1:length(openDateTimes)) {
     if (is.null(templateTimes)) {
       templateTimes <- openDateTimes[j] + 0:23400
     } else {
       templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
     }
   }

   templateTimes <- as.xts(templateTimes)
   dat.i <- merge(dat.i, templateTimes, all=T)
   if (is.na(dat.i[1])) {
     dat.i[1] <- -1
   }
   dat.i <- na.locf(dat.i)
       dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)
       index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))
   setTxtProgressBar(pb, i)
 }
 close(pb)
}

parseTickData <- function(inputFile) {
 DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)
 index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")
 DAT.xts <- xts(DAT.list$Close,index)
 DAT.xts <- make.index.unique(DAT.xts)
 return(DAT.xts)
}

DATTick <- parseTickDataFromDir(tickerDirSecond, "seconds",10)

-----Original Message-----
From: Duncan Murdoch [mailto:murdoch.duncan at gmail.com]
Sent: Sunday, July 22, 2012 4:48 PM
To: David Terk
Cc: r-devel at r-project.org
Subject: Re: [Rd] Reading many large files causes R to crash - 
Possible Bug in R 2.15.1 64-bit Ubuntu

On 12-07-22 3:54 PM, David Terk wrote:
I am reading several hundred files.  Anywhere from 50k-400k in size.
It appears that when I read these files with R 2.15.1 the process 
will hang or seg fault on the scan() call.  This does not happen on 
R
2.14.1.
The code below doesn't do anything other than define a couple of
functions.
Please simplify it to code that creates a file (or multiple files), 
reads it or them, and shows a bug.

If you can't do that, then gradually add the rest of the stuff from 
these functions into the mix until you figure out what is really 
causing
the bug.
If you don't post code that allows us to reproduce the crash, it's 
really unlikely that we'll be able to fix it.

Duncan Murdoch

This is happening on the precise build of Ubuntu.

I have included everything, but the issue appears to be when 
performing the scan in the method parseTickData.

Below is the code.  Hopefully this is the right place to post.

parseTickDataFromDir = function(tickerDir, per, subper, fun) {

  tickerAbsFilenames = list.files(tickerDir,full.names=T)

  tickerNames = list.files(tickerDir,full.names=F)

  tickerNames = gsub("_[a-zA-Z0-9].csv","",tickerNames)

  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), 
style = 3)

  for(i in 1:length(tickerAbsFilenames)) {

    # Grab Raw Tick Data

    dat.i = parseTickData(tickerAbsFilenames[i])

    #Sys.sleep(1)

    # Create Template

    dates <- unique(substr(as.character(index(dat.i)), 1,10))

    times <- rep("09:30:00", length(dates))

    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")

    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {

      if (is.null(templateTimes)) {

        templateTimes <- openDateTimes[j] + 0:23400

      } else {

        templateTimes <- c(templateTimes, openDateTimes[j] +
0:23400)

      }

    }

    # Convert templateTimes to XTS, merge with data and convert 
NA's

    templateTimes <- as.xts(templateTimes)

    dat.i <- merge(dat.i, templateTimes, all=T)

    # If there is no data in the first print, we will have leading 
NA's.  So set them to -1.

    # Since we do not want these values removed by to.period

    if (is.na(dat.i[1])) {

      dat.i[1] <- -1

    }

    # Fix remaining NA's

    dat.i <- na.locf(dat.i)

    # Convert to desired bucket size

    dat.i <- to.period(dat.i, period=per, k=subper, name=NULL)

    # Always use templated index, otherwise merge fails with other 
symbols

    index(dat.i) <- index(to.period(templateTimes, period=per,
k=subper))

    # If there was missing data at open, set close to NA

    valsToChange <- which(dat.i[,"Open"] == -1)

    if (length(valsToChange) != 0) {

      dat.i[valsToChange, "Close"] <- NA

    }

    if(i == 1) {

      DAT = fun(dat.i)

    } else {

      DAT = merge(DAT,fun(dat.i))

    }

    setTxtProgressBar(pb, i)

  }

  close(pb)

  colnames(DAT) = tickerNames

  return(DAT)

}

parseTickData <- function(inputFile) {

  DAT.list <- scan(file=inputFile,
sep=",",skip=1,what=list(Date="",Time="",Close=0,Volume=0),quiet=T)

  index <-
as.POSIXct(paste(DAT.list$Date,DAT.list$Time),format="%m/%d/%Y
%H:%M:%S")

  DAT.xts <- xts(DAT.list$Close,index)

  DAT.xts <- make.index.unique(DAT.xts)

  return(DAT.xts)

}

     [[alternative HTML version deleted]]

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

______________________________________________
R-devel at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel