Cookie Notice

As far as I know, and as far as I remember, nothing in this page does anything with Cookies.

2017/02/28

Having Problems Munging Data in R

#!/group/bioinfo/apps/apps/R-3.1.2/bin/Rscript

# a blog post in code-and-comment form

# Between having some problems with our VMs and wanting 
# to learn Log::Log4perl. I wrote a program that took 
# the load average -- at first at the hour, via 
# crontab -- and stored the value. And, if the load 
# average was > 20, it would send me an alert

# It used to be a problem. It is no longer. Now I 
# just want to learn how to munge data in R

# read in file
logfile = read.table('~/.uptime.log')

# The logfile looks like this:
#
#   2017/01/01 00:02:01 genomics-test : 0.36 0.09 0.03
#   2017/01/01 00:02:02 genomics : 0.04 0.03 0.04
#   2017/01/01 00:02:02 genomics-db : 0.12 0.05 0.01
#   2017/01/01 00:02:04 genomics-apps : 1.87 1.24 0.79
#   2017/01/01 01:02:02 genomics-db : 0.24 0.14 0.05
#   2017/01/01 01:02:02 genomics-test : 0.53 0.14 0.04
#   2017/01/01 01:02:03 genomics : 0.13 0.09 0.08
#   2017/01/01 01:02:04 genomics-apps : 1.66 1.82 1.58
#   2017/01/01 02:02:01 genomics-test : 0.15 0.03 0.01
#   ...

# set column names
colnames(logfile)=c('date','time','host','colon','load','x','y')

# now:
#
#   date       time     host         colon load x y
#   2017/01/01 00:02:01 genomics-test : 0.36 0.09 0.03
#   2017/01/01 00:02:02 genomics : 0.04 0.03 0.04

logfile$datetime <- paste( as.character(logfile$date) , as.character(logfile$time) )
# datetime == 'YYYY/MM/DD HH:MM:SS'
logfile$datetime <- sub('......$','',logfile$datetime)
# datetime == 'YYYY/MM/DD HH'
logfile$datetime <- sub('/','',logfile$datetime)
# datetime == 'YYYYMM/DD HH'
logfile$datetime <- sub('/','',logfile$datetime)
# datetime == 'YYYYMMDD HH'
logfile$datetime <- sub(' ','',logfile$datetime)
# datetime == 'YYYYMMDDHH'

# for every datetime in logfile. I love clean data

# removes several columns we no longer need

logfile$time    <- NULL
logfile$date    <- NULL
logfile$colon   <- NULL
logfile$x       <- NULL
logfile$y       <- NULL

# logfile now looks like this:
#
#   datetime  host             load
#   2017010100 genomics-test    0.36 
#   2017010100 genomics         0.04 
#   2017010100 genomics-db      0.12 
#   2017010100 genomics-apps    1.87 
#   2017010101 genomics-db      0.24 
#   2017010101 genomics-test    0.53 
#   2017010101 genomics         0.13 
#   2017010101 genomics-apps    1.66 
#   2017010102 genomics-test    0.15 
#   ...

# and we can get the X and Y for a big huge replacement table
hosts <- unique(logfile$host[order(logfile$host)])
dates <- unique(logfile$datetime)

# because what we want is something closer to this
#
#   datetime        genomics    genomics-apps   genomics-db     genomics-test
#   2017010100      0.04        1.87            0.12            0.36
#   2017010101      0.13        1.66            0.15            0.53
#   ...

# let's try to put it into a dataframe

uptime.data <- data.frame()
uptime.data$datetime <- vector() ;
for ( h in hosts ) {
    uptime.data[h] <- vector()
    } 

# and here, we have a data frame that looks like 
#
#   datetime        genomics    genomics-apps   genomics-db     genomics-test
#
# as I understand it, you can only append to a data frame by merging.
# I need to create a data frame that looks like
#
#   datetime        genomics    genomics-apps   genomics-db     genomics-test
#   2017010100      0.04        1.87            0.12            0.36
#
# and then merge that. Then do the same with 
#
#   datetime        genomics    genomics-apps   genomics-db     genomics-test
#   2017010101      0.13        1.66            0.15            0.53
#
# and so on.
#
# I don't know how to do that. 
#
# I *think* the way is make a one-vector data frame:
#
#   datetime        
#   2017010101      
#
# and add the vectors one at a time.

for ( d in dates ) {

    # we don't and the whole log here. we just want 
    # this hour's data
    # 
    #   datetime  host             load
    #   2017010100 genomics-test    0.36 
    #   2017010100 genomics         0.04 
    #   2017010100 genomics-db      0.12 
    #   2017010100 genomics-apps    1.87 
    log <- subset(logfile, datetime==d)

    print(d)

    for ( h in hosts ) {
        # and we can narrow it down further
        # 
        #   datetime  host             load
        #   2017010100 genomics         0.04 
        hostv <- subset(log,host==h)
        load = hostv$load 
        # problem is, due to fun LDAP issues, sometimes 
        # the logging doesn't happen
        if ( 0 == length(load) ) { load <- -1 }
        print(paste(h, load ))
    }

    # and here's where I'm hung. I can get all the pieces 
    # I want, even -1 for missing values, but I can't seem  
    # to put it together into a one-row data frame
    # to append to uptime.data. 

    #   [1] "2017010100"
    #   [1] "genomics 0.04"
    #   [1] "genomics-apps 1.87"
    #   [1] "genomics-db 0.12"
    #   [1] "genomics-test 0.36"
    #   [1] "2017010101"
    #   [1] "genomics 0.13"
    #   [1] "genomics-apps 1.66"
    #   [1] "genomics-db 0.24"
    #   [1] "genomics-test 0.53"
    #   [1] "2017010102"
    #   [1] "genomics 0.36"
    #   [1] "genomics-apps 0.71"
    #   [1] "genomics-db 0.08"
    #   [1] "genomics-test 0.15"

}