# SL96 functions related to whole-genome prediction (WGP).

# "WGP" constructs WGP models.
# Arguments
#	GenoFile: Character string of genotype file name (e.g. "SL96axiom.txt").
#	PhenoFile: Character string of phenotype file name (e.g. "SL96ws2015.txt").
#	Method: "GBLUP", "RKHS", "RF"
#	Trait: Character string(s) of trait name to be analyzed (e.g. "SSC", c("SSC", "TFW")).
# Value
#	Binary file for WGP.
	WGP <- function(GenoFile, PhenoFile, Method, Trait=NULL) {
		Methods <- c("GBLUP","BL","EBL","wBSR","BayesC","RKHS","RF")
		if (!is.element(Method, Methods)) { stop("Invalid method.") }
		if (Method=="GBLUP") { WGPrr(GenoFile, PhenoFile, Method, Trait) }
		if (Method=="RKHS") { WGPrr(GenoFile, PhenoFile, Method, Trait) }
		if (Method=="RF") { WGPrf(GenoFile, PhenoFile, n.tree=500, Trait) }		
		}

# "WGPrr" constructs WGP models using functions in rrBLUP version 4.3
#	Reference: Endelman(2011) doi:10.3835/plantgenome2011.08.0024
# Arguments
#	GenoFile: Character string of genotype file name (e.g. "SL96axiom.txt").
#	PhenoFile: Character string of phenotype file name (e.g. "SL96ws2015.txt").
#	Method: "RR" for Ridge regression. "GAUSS" for Reproducing kernel Hilbert spacing regression.
#	Trait: Character string(s) of trait name to be analyzed (e.g. "SSC", c("SSC", "TFW")).
	WGPrr <- function(GenoFile, PhenoFile, Method, Trait=NULL) {
	# check genotype data format
		In <- read.table(GenoFile)
		r <- checkGenoData(In)
		if (r==1) { stop("Genotype format error.") }
		ggid <- unique(udac(In[1,5:ncol(In)]))
	# check phenotype data format
		In <- read.table(PhenoFile, header=TRUE)
		r <- checkPhenoData(In, Trait)
		if (r==1) { stop("Phenotype format error.") }
		pgid <- udac(In$gid)
	# check consistency between gentype and phenotype
		ans <- unique(c(is.element(ggid, pgid), is.element(pgid,ggid)))
		if (length(ans[ans==FALSE])==1) { stop("gid is inconsistent between phenotype and genotype.") }
	# import and reformat genotype data
		unphase(GenoFile, "tent.txt")
		Data <- read.table("tent.txt", header=TRUE)
		system("rm tent.txt")
		G <- Data[,5:ncol(Data)]
		G <- t(G)
		GenoFileName <- unlist(strsplit(GenoFile,".txt"))
	# import phenotype data
		Y <- read.table(PhenoFile, header=TRUE)
		PhenoFileName <- unlist(strsplit(PhenoFile,".txt"))
	# construct WGP models
		if (length(Trait)==0) Trait <- colnames(Y)[2:ncol(Y)]
		for (k in 1:length(Trait)) {
			wgp <- vector(mode="list", length=3)
			wgp[[1]] <- udan(Y[Trait[k]])
			wgp[[2]] <- G
			wgp[[3]] <- Method
			filename <- paste(GenoFileName, PhenoFileName, Trait[k], Method, sep="_")
			binname <- paste(filename, ".bin", sep="")
			save(wgp, file=binname)
			}
		}

# "WGPrf" constructs WGP models using functions in randomForest.
# Arguments
#	GenoFile: Character string of genotype file name (e.g. "SL96axiom.txt").
#	PhenoFile: Character string of phenotype file name (e.g. "SL96ws2015.txt").
#	Trait: Character string(s) of trait name to be analyzed (e.g. "SSC", c("SSC", "TFW")).
#	n.tree: Number for rondome forest tree.
	WGPrf <- function(GenoFile, PhenoFile, Trait=NULL, n.tree=500) {
		library("randomForest")
	# check genotype data format
		In <- read.table(GenoFile)
		r <- checkGenoData(In)
		if (r==1) { stop("Genotype format error.") }
		ggid <- unique(udac(In[1,5:ncol(In)]))
	# check phenotype data format
		In <- read.table(PhenoFile, header=TRUE)
		r <- checkPhenoData(In, Trait)
		if (r==1) { stop("Phenotype format error.") }
		pgid <- udac(In$gid)
	# check consistency between gentype and phenotype
		ans <- unique(c(is.element(ggid, pgid), is.element(pgid,ggid)))
		if (length(ans[ans==FALSE])==1) { stop("gid is inconsistent between phenotype and genotype.") }
	# import and reformat genotype data
		unphase(GenoFile, "tent.txt")
		Data <- read.table("tent.txt", header=TRUE)
		system("rm tent.txt")
		G <- Data[,5:ncol(Data)]
		G <- t(G)
		GenoFileName <- unlist(strsplit(GenoFile,".txt"))
	# import phenotype data
		Y <- read.table(PhenoFile, header=TRUE)
		PhenoFileName <- unlist(strsplit(PhenoFile,".txt"))
	# construct WGP models
		if (length(Trait)==0) Trait <- colnames(Y)[2:ncol(Y)]
		for (k in 1:length(Trait)) {
			y.train <- Y
			y.train <- udan(y.train[Trait[k]])
			use <- !is.na(y.train)
			y.train <- y.train[use]
			g.train <- G
			g.train <- g.train[use,]
			wgp <- randomForest(x=g.train, y=y.train, ntree=n.tree, importance=TRUE)
			filename <- paste(GenoFileName, PhenoFileName, Trait[k], "RF", sep="_")
			binname <- paste(filename, ".bin", sep="")
			save(wgp, file= binname)			
			}
		}

# "calcGEBV" calculates genomic estimated breeding values from whole-genome prediction (WGP) model files.
# Arguments
#	GenoFile: Character string of genotype file name to be predicted (e.g. "SL96axiom.txt").
#	ModelFile: Character string of WGP model file constructed by using function WGP (e.g. "SL96axiom_SL96ws2015_SSC_RR.bin").
# Value
#	Matrix of the result.
	calcGEBV <- function(GenoFile, ModelFile) {
		Method <- unlist(strsplit(ModelFile, "_"))[4]
		Method <- unlist(strsplit(Method, ".bin"))
		if (is.element(Method, c("GBLUP","RKHS"))) {
			library("rrBLUP")
			if (Method=="GBLUP") { Kmethod <- "RR" }
			if (Method=="RKHS") { Kmethod <- "GAUSS" }
			load(ModelFile)
			unphase(GenoFile, "tent.txt")
			Data <- read.table("tent.txt", header=TRUE)
			system("rm tent.txt")
			G <- Data[,5:ncol(Data)]
			G <- t(G)
			ans <- kinship.BLUP(wgp[[1]], wgp[[2]], G.pred=G, K.method=Kmethod)
			Result <- as.data.frame(matrix(NA, ncol=2, nrow=nrow(G)))
			colnames(Result) <- c("gid", "Yhat")
			Result$gid <- rownames(G)
			Result$Yhat <- udan(ans$g.pred) + udan(ans$beta)
			}
		if (Method=="RF") {
			load(ModelFile)
			library("randomForest")
			unphase(GenoFile, "tent.txt")
			x <- read.table("tent.txt", header=TRUE)
			system("rm tent.txt")
			G <- x[,5:ncol(x)]
			G <- t(G)
			gid1 <- rownames(G)
			gid2 <- labels(wgp$predicted)
			r <- c(which(!is.element(gid1,gid2)), which(!is.element(gid2,gid1)))
			if (length(r)==0) { Yhat <- udan(wgp$predicted); print("The predicted values are based on out-of-bag samples") }
			if (length(r)!=0) { Yhat <- predict(wgp, G) }
			Result <- as.data.frame(matrix(NA, ncol=2, nrow=nrow(G)))
			colnames(Result) <- c("gid", "Yhat")
			Result$gid <- rownames(G)
			Result$Yhat <- udan(Yhat)
			}
		#if (is.element(Method, c("BL","EBL","wBSR","BayesC")) { }
		Result
		}
