Skip to content

Reading Json data

1 message · Mark Sharp

#
Mayukh,

I apologize for taking so long to get back to your problem. I expect you may have found a solution by now; if so, I would be interested in seeing it. I have developed a hack to solve the problem, but I expect that someone who knows how to handle JSON objects, or even text parsing, better than I do could develop a more elegant solution.

As I understand the problem, your text file has more than one JSON object in text form. There are three. The first two are very similar, and the last is a trailer indicating what was done, when it was done, and the number of JSON objects sent. The problem is that fromJSON() only pulls off the first of the JSON objects.

I have defined three helper functions to separate the JSON objects, read them in, and store them in a list.

library(RJSONIO)
library(stringi, quietly = TRUE)
#library(jsonlite) # also works

#' Returns dataframe with ordered locations of the matching braces.
#'
#' Scans the text once from left to right and pairs every closing brace with
#' the most recent opening brace that is still unmatched. Braces without a
#' partner (a stray "}" or an unclosed "{") are left out of the result, so
#' text without any braces yields a data frame with zero rows.
#'
#' Every brace character is counted, including braces inside quoted strings.
#' Only base R is used.
#'
#' @param txt character vector of length one having 0 or more matching braces.
#' Only the first element is examined.
#' @return data.frame with numeric columns \code{left} and \code{right}
#' holding the character positions of each matched pair, sorted by
#' \code{left}. Row names record the order in which the pairs were closed.
#' @examples 
#' library(rmsutilityr)
#' match_braces("{123{456{78}9}10}")
#' @export
match_braces <- function(txt) {
  txt <- as.character(txt)[1] # just in the case of having more than one element
  chars <- strsplit(txt, "")[[1]]
  # %in% never yields NA, so an NA or empty input simply has no braces.
  brace_pos <- which(chars %in% c("{", "}"))
  is_open <- chars[brace_pos] == "{"

  # Preallocate for the largest possible number of pairs and open braces.
  left <- numeric(sum(!is_open))
  right <- numeric(sum(!is_open))
  open_stack <- numeric(sum(is_open))
  depth <- 0L
  n_pairs <- 0L
  for (k in seq_along(brace_pos)) {
    if (is_open[k]) {
      depth <- depth + 1L
      open_stack[depth] <- brace_pos[k]
    } else if (depth > 0L) {
      # Closing brace: its partner is the innermost brace still open.
      n_pairs <- n_pairs + 1L
      left[n_pairs] <- open_stack[depth]
      right[n_pairs] <- brace_pos[k]
      depth <- depth - 1L
    }
    # A closing brace seen at depth 0 has no partner and is skipped.
  }

  matched <- seq_len(n_pairs)
  braces <- data.frame(left = left[matched], right = right[matched])
  braces[order(braces$left), ]
}

#' Returns a list containing two objects in the text of a character vector 
#' of length one: (1) object = the first json object found and (2) remainder = 
#' the remaining text.
#' 
#' The text is scanned for the first "{" and the "}" that closes it. Inside
#' an object, double-quoted strings are tracked (including backslash
#' escapes), so braces that are part of a string value do not affect where
#' the object ends. Text in front of the first "{" is discarded.
#'
#' Otherwise properly formed messages are assumed; the only check is that a
#' complete object exists. Only base R is used.
#' @param json_txt character vector of length one having one or more JSON
#' objects in character form. Only the first element is examined.
#' @return list with \code{object}, the text of the first JSON object, and
#' \code{remainder}, the text after that object with surrounding white space
#' removed ("" when nothing is left).
#' @export
get_first_json_message <- function(json_txt) {
  json_txt <- as.character(json_txt)[1]
  chars <- if (is.na(json_txt)) character(0) else strsplit(json_txt, "")[[1]]
  # Only these four characters can change the scanner state, so the loop
  # visits them and skips everything else.
  marks <- which(chars %in% c("{", "}", "\"", "\\"))

  in_string <- FALSE
  escaped_at <- 0L # position of a character escaped by a preceding backslash
  depth <- 0L
  first <- 0L
  last <- 0L
  for (pos in marks) {
    ch <- chars[pos]
    if (in_string) {
      if (pos == escaped_at) {
        next # escaped quote or backslash: plain string content
      }
      if (ch == "\\") {
        escaped_at <- pos + 1L
      } else if (ch == "\"") {
        in_string <- FALSE
      }
    } else if (ch == "\"") {
      # Quotes are only meaningful inside an object; stray quotes in the
      # text between objects are ignored.
      if (depth > 0L) {
        in_string <- TRUE
      }
    } else if (ch == "{") {
      depth <- depth + 1L
      if (depth == 1L) {
        first <- pos
      }
    } else if (ch == "}" && depth > 0L) {
      depth <- depth - 1L
      if (depth == 0L) {
        last <- pos
        break
      }
    }
  }

  if (last == 0L) {
    stop("no complete JSON object found in 'json_txt'", call. = FALSE)
  }

  # substring() returns "" when the object ends at the last character.
  remainder <- trimws(substring(json_txt, last + 1L), whitespace = "[\\h\\v]")
  list(object = substr(json_txt, first, last),
       remainder = remainder)
}
#' Returns list of lists made by call to fromJSON()
#'
#' Repeatedly splits the first JSON object off the front of the text with
#' get_first_json_message(), parses it with fromJSON(), and continues with
#' the remaining text until nothing is left. Empty, white-space-only,
#' missing, or zero-length input yields an empty list.
#' @param json_txt character vector of length 1 having one or more
#' JSON objects in text form.
#' @return list with one element per JSON object, in the order the objects
#' appear in \code{json_txt}.
#' @export
get_json_list <- function (json_txt) {
  json_list <- list()
  pending <- json_txt
  # Nothing to parse: return the empty list instead of failing in the helper.
  if (length(pending) == 0L || is.na(pending[1]) ||
      !nzchar(trimws(pending[1], whitespace = "[\\h\\v]"))) {
    return(json_list)
  }
  i <- 0
  repeat {
    i <- i + 1
    message_remainder <- get_first_json_message(pending)
    # Wrapping in list() keeps the slot even if fromJSON() returns NULL.
    json_list[i] <- list(fromJSON(message_remainder$object))
    if (!nzchar(message_remainder$remainder)) {
      break
    }
    pending <- message_remainder$remainder
  }
  json_list
}

# Read the whole file, join its lines into one string separated by single
# spaces, and strip surrounding white space before splitting it into objects.
json_file <- "../data/json_file.txt"
json_txt <- readLines(json_file)
json_txt <- stri_c(json_txt, collapse = " ")
json_txt <- stri_trim_both(json_txt)

# One list element per JSON object found in the file.
json_list <- get_json_list(json_txt)
length(json_list)


R. Mark Sharp, Ph.D.
Director of Primate Records Database
Southwest National Primate Research Center
Texas Biomedical Research Institute
P.O. Box 760549
San Antonio, TX 78245-0549
Telephone: (210)258-9476
e-mail: msharp at TxBiomed.org