# R script for transforming Rowan Lockwood's original data into the two # tables that are included in the data registry. The first column of her # original dataset gives the taxon number, and columns 2-32 contain # taxon-level information. The remaining columns represent three batches # of variables (X, PX, and RX), each with 261 columns corresponding to # different locations; the location identifier is embedded in the column # name, as in "X.4100". # # The script extracts the taxon-level column into a separate 'traits' # table, and converts the X/RX/PX variable columns into a long-formatted # table like so: # taxon location X RX PX # # The two output tables can be re-joined using the "Number" field (i.e., # the taxon identifier). # # Jim Regetz # Created: 18-Jun-2008 # Read in data lockwood <- read.csv("LockwoodKT.csv", na.strings=NULL) # 'reshape' package is required for melt() function library(reshape) # Define custom function for converting a particular subset of Rowan's # dataframe to long format, based on header parsing submelt <- function(dframe, variable) { regexp <- paste("^", variable, "\\.*", sep="") melted <- melt(dframe[c(1, grep(regexp, names(dframe)))], id.var=1, variable_name="Location") melted$Location <- gsub(regexp, "", melted$Location) names(melted)[names(melted)=="value"] <- variable return(melted) } # Convert X, RX, and PX variables to long format, and combine them into # a single table with rows referenced by taxon number X.long <- submelt(lockwood, "X") PX.long <- submelt(lockwood, "PX") RX.long <- submelt(lockwood, "RX") abundance <- merge(merge(X.long, PX.long, sort=FALSE), RX.long, sort=FALSE) abundance$Location <- as.numeric(abundance$Location) # Now make a second table for the taxon-level data traits <- lockwood[1:32] # Write both tables to CSV files. The na argument is used to ensure that # any values treated as NA by R will be written out as blanks (as in the # original dataset). write.csv(traits, "lockwood_traits.csv", na="", row.names=FALSE) write.csv(abundance, "lockwood_abundance.csv", na="", row.names=FALSE)