# SL96 functions related to cross-validation (CV).

# "CV" performs CV.
# Arguments
#	GenoFile: Character string of genotype file name (e.g. "SL96axiom.txt").
#	PhenoFile: Character string of phenotype file name (e.g. "SL96ws2015.txt").
#	Method: "GBLUP", "RKHS", "RF"
#	Trait: Character string(s) of trait name to be analyzed (e.g. "SSC", c("SSC", "TFW")).
# Value
#	Text file of the result.
	CV <- function(GenoFile, PhenoFile, Method, Fold=10, Repeat=1, Seed=NULL, Trait=NULL) {
		Methods <- c("GBLUP","BL","EBL","wBSR","BayesC","RKHS","RF")
		if (!is.element(Method, Methods)) { stop("Invalid method.") }
		if (Method=="GBLUP") { CVrr(GenoFile, PhenoFile, Method, Fold, Repeat, Seed, Trait) }
		if (Method=="RKHS") { CVrr(GenoFile, PhenoFile, Method, Fold, Repeat, Seed, Trait) }
		if (Method=="RF") { CVrf(GenoFile, PhenoFile, Fold, Repeat, Seed, Trait, n.tree=500) }
		}

# "makePartition" constructs patterns of partition for CV.
# Arguments
#	N.lines: Number of lines in training population.
#	Fold: Number of fold for cross-validation.
#	Seed: Random seed number.
# Value
#	Matrix of pattern of partition.
	makePartition <- function(N.lines, Fold, Seed) {
		n.pred <- ceiling(N.lines/Fold)
		n.fill <- Fold*n.pred - N.lines
		x <- seq(1,N.lines,by=1)
		set.seed(Seed)
		x <- c(sample(x, length(x)), rep(-9, n.fill))
		matrix(x, ncol=Fold, byrow=TRUE)
		}

# "CVrr" performs CV of WGP models using functions in rrBLUP.
# Arguments
#	GenoFile: Character string of genotype file name (e.g. "SL96axiom.txt").
#	PhenoFile: Character string of phenotype file name (e.g. "SL96ws2015.txt").
#	Method: "RR" for Ridge regression. "GAUSS" for Reproducing kernel Hilbert spacing regression.
#	Fold: Number of folds.
#	Repeat: Number of repeats.
#	Seed: Random seed number.
#	Trait: Character string(s) of trait name to be analyzed (e.g. "SSC", c("SSC", "TFW")).
# Value
#	Text file of the result.
	CVrr <- function(GenoFile, PhenoFile, Method, Fold, Repeat, Seed, Trait) {
		library("rrBLUP")
		if (Method=="GBLUP") { Kmethod <- "RR" }
		if (Method=="RKHS") { Kmethod <- "GAUSS" }
	# check genotype data format
		In <- read.table(GenoFile)
		r <- checkGenoData(In)
		if (r==1) { stop("Genotype format error.") }
		ggid <- unique(udac(In[1,5:ncol(In)]))
	# check phenotype data format
		In <- read.table(PhenoFile, header=TRUE)
		r <- checkPhenoData(In, Trait)
		if (r==1) { stop("Phenotype format error.") }
		pgid <- udac(In$gid)
	# check consistency between gentype and phenotype
		ans <- unique(c(is.element(ggid, pgid), is.element(pgid,ggid)))
		if (length(ans[ans==FALSE])==1) { stop("gid is inconsistent between phenotype and genotype.") }
	# import and reformat genotype data
		unphase(GenoFile, "tent.txt")
		Data <- read.table("tent.txt", header=TRUE)
		system("rm tent.txt")
		G <- Data[,5:ncol(Data)]
		G <- t(G)
		GenoFileName <- unlist(strsplit(GenoFile,".txt"))
	# import phenotype data
		Y <- read.table(PhenoFile, header=TRUE)
		if (Fold > nrow(Y)) { stop("Invalid fold.") }
		PhenoFileName <- unlist(strsplit(PhenoFile,".txt"))
	# perform cross-validation
		if (is.null(Seed)) { Seed <- udan(gsub(":","",unlist(strsplit(udac(Sys.time())," "))[2])) }
		if (is.null(Trait)) Trait <- colnames(Y)[2:ncol(Y)]
		Results <- as.data.frame(matrix(NA, nrow=Repeat, ncol=length(Trait)))
		colnames(Results) <- Trait
		for (r in 1:Repeat) {
			Partition <- makePartition(nrow(Y), Fold, Seed+r)
			for (k in 1:length(Trait)) {
				result <- c()
				for (cv in 1:ncol(Partition)) {
					test <- sort(Partition[,cv])
					test <- test[test!=-9]
					code <- seq(1,nrow(Y))
					y.train <- Y[!is.element(code, test),]
					y.pred <- Y[is.element(code, test),]
					g.train <- G[!is.element(code, test),]
					g.pred <- G[is.element(code, test),]
					if (is.vector(g.pred)) { g.pred <- t(g.pred) }
					ans <- kinship.BLUP(udan(y.train[Trait[k]]), G.train=g.train, G.pred=g.pred, K.method=Kmethod)
					w <- cbind(test, ans$g.pred, y.pred[Trait[k]])
					colnames(w) <- c("Test", "BV", "Y")
					result <- rbind(result, w)
					}
				Results[Trait[k]][r,] <- cor(result$BV, result$Y)
				}
			}
		result <- as.data.frame(result)
		Results <- as.matrix(Results)	
		if (Repeat >= 3) {
			Results <- rbind(apply(Results, 2, mean), apply(Results, 2, sd))
			rownames(Results) <- c("mean", "sd")
			}
		filename <- paste(GenoFileName, PhenoFileName, paste(Fold,"CV",Repeat, sep=""), Method, Seed, sep="_")
		txtname <- paste(filename, ".csv", sep="")
		write.csv(Results, txtname, quote=FALSE)
		}

# "CVrf" performs cross-validation of whole-genome prediction (WGP) models using random forest.
# Arguments
#	GenoFile: Character string of genotype file name (e.g. "SL96axiom.txt").
#	PhenoFile: Character string of phenotype file name (e.g. "SL96ws2015.txt").
#	Fold: Number of folds.
#	Repeat: Number of repeats.
#	Seed: Random seed number.
#	Trait: Character string(s) of trait name to be analyzed (e.g. "SSC", c("SSC", "TFW")).
#	n.tree: Number for rondome forest tree.
# Value
#	Text file of the result.
	CVrf <- function(GenoFile, PhenoFile, Fold, Repeat, Seed, Trait, n.tree=500) {
		library("randomForest")
	# check genotype data format
		In <- read.table(GenoFile)
		r <- checkGenoData(In)
		if (r==1) { stop("Genotype format error.") }
		ggid <- unique(udac(In[1,5:ncol(In)]))
	# check phenotype data format
		In <- read.table(PhenoFile, header=TRUE)
		r <- checkPhenoData(In, Trait)
		if (r==1) { stop("Phenotype format error.") }
		pgid <- udac(In$gid)
	# check consistency between gentype and phenotype
		ans <- unique(c(is.element(ggid, pgid), is.element(pgid,ggid)))
		if (length(ans[ans==FALSE])==1) { stop("gid is inconsistent between phenotype and genotype.") }
	# import and reformat genotype data
		unphase(GenoFile, "tent.txt")
		Data <- read.table("tent.txt", header=TRUE)
		system("rm tent.txt")
		G <- Data[,5:ncol(Data)]
		G <- t(G)
		GenoFileName <- unlist(strsplit(GenoFile,".txt"))
	# import phenotype data
		Y <- read.table(PhenoFile, header=TRUE)
		if (Fold > nrow(Y)) { stop("Invalid fold.") }
		PhenoFileName <- unlist(strsplit(PhenoFile,".txt"))
	# perform cross-validation
		if (is.null(Seed)) { Seed <- udan(gsub(":","",unlist(strsplit(udac(Sys.time())," "))[2])) }
		if (is.null(Trait)) Trait <- colnames(Y)[2:ncol(Y)]
		Results <- as.data.frame(matrix(NA, nrow=Repeat, ncol=length(Trait)))
		colnames(Results) <- Trait
		for (r in 1:Repeat) {
			Partition <- makePartition(nrow(Y), Fold, Seed+r)
			for (k in 1:length(Trait)) {
				result <- c()
				for (cv in 1:ncol(Partition)) {
					test <- sort(Partition[,cv])
					test <- test[test!=-9]
					code <- seq(1,nrow(Y))
					y.train <- Y[!is.element(code, test),]
					y.train <- udan(y.train[Trait[k]])
					use <- !is.na(y.train)
					y.train <- y.train[use]
					y.pred <- Y[is.element(code, test),]
					y.pred <- udan(y.pred[Trait[k]])
					g.train <- G[!is.element(code, test),]
					g.train <- g.train[use,]
					g.pred <- G[is.element(code, test),]
					if (is.vector(g.pred)) { g.pred <- t(g.pred) }

					ans <- randomForest(x=g.train, y=y.train, ntree=n.tree, importance=TRUE)
					y.est <- predict(ans, g.pred)
					w <- cbind(test, y.est, y.pred)
					colnames(w) <- c("Test", "BV", "Y")
					result <- rbind(result, w)
					}
				result <- as.data.frame(result)

				Results[Trait[k]][r,] <- cor(result$BV, result$Y)
				}
			}
		Results <- as.matrix(Results)	
		if (Repeat >= 3) {
			Results <- rbind(apply(Results, 2, mean), apply(Results, 2, sd))
			rownames(Results) <- c("mean", "sd")
			}
		filename <- paste(GenoFileName, PhenoFileName, paste(Fold,"CV",Repeat, sep=""), "RF", Seed, sep="_")
		txtname <- paste(filename, ".csv", sep="")
		write.csv(Results, txtname, quote=FALSE)
		}
