Methods used to transform the MS Excel spreadsheet into a comma delineated file
The program called "R" was used to combine the data from the individual bubble trap sections of the spreadsheet.
#
# R script for restructuring a Walter Lake methane dataset associated
# with the CH4 Emissions working group (NCEAS 10645: Zhuang). The
# restructuring is performed in accordance with a set of rules developed
# by Callie Bowdish.
#
# Note that this script standardizes column names, but does *not* fully
# clean up the data. For example, there are some character strings (e.g,
# "START", "#VALUE!") in columns that should probably only have numeric
# values. These are simply saved back out exactly as they were read in.
#
# Also note that values of "." (along with blank fields in numeric
# columns) in the original data are treated as NA, and subsequently
# written back out as blank fields. This behavior could be changed by
# removing the na.strings argument from the read.csv function.
#
# Author: Jim Regetz
# Date created: 18-Mar-2008
# NCEAS
#
# Read in the first 10319 rows of data (not including header). Don't
# change column names, and treat "." as NA
walter <- read.csv("Walter_Lake_ebullition_fluxMat.csv", nrow=10319,
na.strings = ".", check.names = FALSE, stringsAsFactors=FALSE)
# Clean up several column names for consistency
names(walter)[names(walter)=="Lake"] <- "lake"
names(walter)[grep("(mg CH4 m-2 d-1)", names(walter))] <-
"(mg_CH4_m-2_d-1)"
names(walter)[grep("(mgCH4_spot-1_d-1)", names(walter))] <-
"(mg_CH4_spot-1_d-1)"
names(walter)[grep("(mgCH4 spot-1 d-1)", names(walter))] <-
"(mg_CH4_spot-1_d-1)"
# Drop two empty columns (they're the ones without header names)
walter <- walter[, names(walter)!=""]
# Specify the desired order of the repeated columns
SORTORDER <- c("Bubble_Trap_No", "lake", "Bubble_Type",
"water_depth_(m)", "lake_section", "(mg_CH4_spot-1_d-1)",
"(mg_CH4_m-2_d-1)")
# Determine starting and ending col# for each batch of data
colIDs_start <- grep("Bubble_Trap", names(walter))
colIDs_end <- c(colIDs_start[-1] - 1, ncol(walter))
# Define a function for extracting a batch of columns from dframe,
# standardizing column names, and adding empty columns if any are
# missing.
extractBatch <- function(dframe, start, end, ord) {
batch <- dframe[start:end]
names(batch) <- gsub("^[[:digit:]]*_", "", names(batch))
names(batch) <- gsub("\\.[[:digit:]]*$", "", names(batch))
# If missing, create 'lake_section' and 'water_depth(m)' columns
if (!"lake_section" %in% names(batch)) {
batch$`lake_section` <- rep(NA, nrow(batch))
}
if (!"water_depth_(m)" %in% names(batch)) {
batch$`water_depth_(m)` <- rep(NA, nrow(batch))
}
# Add the missing CH4 column (or give error if both are missing)
if (!"(mg_CH4_m-2_d-1)" %in% names(batch)) {
batch$`(mg_CH4_m-2_d-1)` <- rep(NA, nrow(batch))
} else if (!"(mg_CH4_spot-1_d-1)" %in% names(batch)) {
batch$`(mg_CH4_spot-1_d-1)` <- rep(NA, nrow(batch))
} else {
stop("No CH4 column found")
}
# Re-sort columns and combine with first 5 columns of dataset
cbind(dframe[1:5], batch[ord])
}
# Break all the data into batches, using the function defined above
batches <- mapply(extractBatch, start=colIDs_start, end=colIDs_end,
MoreArgs=list(dframe=walter, ord=SORTORDER), SIMPLIFY=FALSE)
# Combine all the batches (by rows) into a big dataframe
final <- do.call("rbind", batches)
# Save as CSV, replacing NAs with blanks
write.csv(final, file="Walter_Lake_ebullition_fluxMat_FIXED.csv",
row.names=FALSE, na="", quote=FALSE)