library(slam);
library(som);
library(textir);
library(hash);
library(reshape);
library(Matrix);

#Calculates loadings using a) correlations b) OLS c) mnlm
#Uses only House members, and trims numbers from the vocab
#Also computes a loading on the South indicator


##N-gram size to analyse (1 = unigrams, 2 = bigrams, 3 = trigrams); congresses available: 43-111
# ---- N-gram configuration -------------------------------------------------
# gramSize picks the n-gram order; gramStr and gramName are the pieces spliced
# into the input/output file names further down ("" if gramSize is not 1-3).
gramSize <- 3
gramStr <- toString(gramSize)
gramName <- switch(gramStr, "1" = "Uni", "2" = "Bi", "3" = "Tri", "")

# Size of the top-of-list vocabulary kept in googVoc (and reused later as the
# number of columns entering the correlations).
vLimit <- 10000

# ---- Google n-gram reference table ----------------------------------------
# Column 1 of the table is used as the n-gram key and column 3 as the value
# the rows are sorted on (compared against frequency thresholds later on).
subsetName <- paste0("rawdata\\googleSubset", gramStr, ".txt")
googSub <- read.table(subsetName)
gLen <- nrow(googSub)

# Ascending sort on column 3, so the largest values sit at the bottom.
googleFreqOrd <- order(googSub[, 3])
googOrdered <- googSub[googleFreqOrd, ]

# The vLimit rows just above the last row of the sorted table
# (the last row itself is left out).
googVoc <- as.matrix(googOrdered[(gLen - vLimit):(gLen - 1), 1])

# Lookup from n-gram to its row position in googOrdered, for rows 2..gLen.
googHash <- hash(googOrdered[2:gLen, 1], 2:gLen)
# The "TOTALWORDS:" key is forced to position 0
# (presumably a totals row rather than a real n-gram -- confirm).
googHash["TOTALWORDS:"] <- 0

## Congress window; congresses are numbered 43 through 111.
# congInt is the number of congresses handled per pass. Keep it at 1 for now:
# the program is not suitable for combinations of congresses.
congInt <- 1
congStart <- 43
# Last congress of the current pass, capped at 111.
congEnd <- min(111, congStart + congInt - 1)
congStart
while (congStart<=111){
	# One pass per congress window congStart..congEnd (a single congress while
	# congInt is 1). Each pass:
	#   1. loads the speech counts, vocabulary and legislator table,
	#   2. keeps House members and the vocabulary above a Google-frequency cutoff,
	#   3. computes loadings of word counts on four legislator covariates,
	#   4. scores them in-sample and on a held-out 25% of legislators,
	#   5. writes predictions, loadings and validation scores to rawdata\Routput.
	# Reset the per-pass accumulators: an empty sparse count matrix and an
	# empty legislator-covariate matrix.
	dimiszero = as.integer(0)
	countsAll = new("dgTMatrix",Dim=c(dimiszero,dimiszero))
	DWAll = matrix(nrow = 0, ncol = 0)
	for(congNo in congStart:congEnd){
		# Three-character congress label used in the input file names (e.g. "043").
		congStr = ""  
		if (congNo < 100) {congStr = paste("0",toString(congNo),sep = "")
		}else{congStr = toString(congNo)}
		##Load data. It will take a few moments
		# C holds the counts as triplets: columns 1, 2, 3 become the i, j and x
		# slots of the sparse matrix built below (those slots are 0-based, so the
		# file is presumably 0-indexed -- confirm).
		C = read.table(paste("rawdata\\outputData",gramName,"\\cong",congStr,"Speech",gramName,".dat", sep = ""));
		##Full vocab for all congresses. About 1,000,000 words
		##You can compare different congresses using the same vocab
		##But the vocab file for each given congress is ordered differently, 
		##by frequency of use in that congress, so be careful
		vocab = read.table(paste("rawdata\\outputData",gramName,"\\cong",congStr,"Word",gramName,".dat", sep = ""),header=TRUE);
		if(gramSize==1){
			# Drop a few mis-encoded unigrams. A logical mask is used rather than
			# negative indices because vocab[-integer(0),] selects zero rows, which
			# would empty the vocabulary whenever none of these tokens occurs.
			badTok = grepl("(nécessair)|(générale)|(ﬁnancial)",vocab[,1])
			vocab = as.matrix(vocab[!badTok,])
		}
		vLen = dim(vocab)[1]
		# Legislator table. As used below: column 1 = legislator name, column 2 =
		# party code ("D"/"R"), column 3 = state abbreviation, column 4 = chamber
		# ("H" is kept), columns 6:7 = numeric scores (presumably the two
		# DW-NOMINATE dimensions -- confirm).
		DW = read.table(paste("rawdata\\outputData",gramName,"\\cong",congStr,"DW",gramName,".dat", sep = ""),header = TRUE);
		legLen = dim(DW)[1]
		# Sparse legislator-by-word count matrix.
		counts = new("dgTMatrix",i=as.integer(array(C[,1])),j=as.integer(array(C[,2])),x=as.numeric(array(C[,3])),Dim=as.integer(c(legLen,vLen)))
	      ##counts = counts[,(dim(counts)[2]-vLen+1):dim(counts)[2]]
		#restrict to house
		##DWadj = DW
		##DWadj[,6] = DW[,6]

		# Label the rows of both tables with the legislator names so they can be
		# aligned by name later. as.matrix() turns DW into a character matrix.
		legNames <- DW[,1]
		DW = as.matrix(DW)
		rownames(DW) <- legNames
		rownames(counts) <- legNames

		#Correct DW with chambers
		#chambersCong = chambers[chambers[,2]==congNo,]
		#rownames(chambersCong) = chambersCong[,1]
		#nameSet = as.matrix(DW[,1])
		#correctedChambers = chambersCong[nameSet,3]
		#lookup=hash(chambersCong[,1],1:dim(chambersCong)[1])
		#nameInLookup = has.key(nameSet,lookup)
		#nameSubset = as.matrix(nameSet[nameInLookup,])
		#correctedChambers = chambersCong[values(lookup,nameSubset),3]
		#DW[nameSubset[,1],] = 100
		
		# Keep House members only, and append "congStart, number of House rows"
		# to the running log file.
		counts<-counts[DW[,4]=="H",]
		write.table(t(c(congStart,dim(counts)[1])),paste("rawdata\\numpersons.csv"),sep=",",append=TRUE, col.names=FALSE)


		DW<-as.matrix(DW[DW[,4]=="H",])

		##colnames(counts) <- vocab[,1]


		##number of words in vocab
		vLen = dim(vocab)[1]

		##Number of words (chosen from most frequent) we regress over. 
		##A given congress will have less 
		##than 100,000 unique words, most at low frequency
		# Restrict to vocabulary entries that are keys of googHash and look up
		# their row positions in googOrdered.
		vocInGoog = has.key(as.matrix(vocab[,1]),googHash)
		vocabGoogSubset = as.matrix(vocab[vocInGoog,1])
		countsGoogSubset = counts[,vocInGoog]
		gf = googOrdered[1:gLen,]
		gfVals = as.matrix(values(googHash,vocabGoogSubset))
		##gfTemp = gf[order(gfVals[,1]),]
		##First, we trim the set of words to only those with 
		##over 2K freq in google ngrams for trigrams, 10K for bigrams
		if(gramSize == 1){
			# Keep only the looked-up positions that are stored as integers.
			intSet = as.matrix(sapply(gfVals,is.integer))
			gfVals = gfVals[intSet[,1]]
			gfVals = unlist(gfVals)
			freqThresh = 50000
		}
		if(gramSize == 2){ freqThresh = 10000}
		if(gramSize == 3){ freqThresh = 2000}
		if(gramSize > 1){
			# NOTE(review): gfVals has one entry per vocabulary word found in
			# googHash, yet the resulting mask is applied to 1:vLen and to the
			# unfiltered counts/vocab. The two line up only if every vocabulary
			# word is a key of googHash -- confirm for the bigram/trigram inputs.
			gfSub = (1:vLen)[googOrdered[gfVals,3] > freqThresh]
			countsSub = counts[,gfSub]
			vocabSub <- vocab[gfSub,]
		}
		if(gramSize==1){
			vLen = dim(vocabGoogSubset)[1]
			gfSub = (1:vLen)[googOrdered[gfVals,3] > freqThresh]
			countsSub = countsGoogSubset[,gfSub]
			vocabSub <- vocabGoogSubset[gfSub,]
		}

		# Merge this congress into the accumulators. Rows are the union of the
		# legislator names seen so far; columns are this congress's retained
		# vocabulary (one reason combining congresses is not supported).
		sLen = dim(countsSub)[2]
		allrows <- union(rownames(countsAll),rownames(countsSub))
		rLen <- length(allrows)
		d1 = as.integer(rLen)
		d2 = as.integer(sLen)
		countsTemp <- new("dgTMatrix", Dim = c(d1,d2))
		rownames(countsTemp)<-allrows
		colnames(countsTemp)<-vocabSub
		if(congNo>congStart){
			countsTemp[rownames(countsAll),] <- countsAll
		}
		countsTemp[rownames(countsSub),] <- countsTemp[rownames(countsSub),]+countsSub
		countsAll <- new("dgCMatrix",countsTemp)
		rownames(countsAll) <- rownames(countsTemp)

		# Same merge for the legislator table; rows from the current congress
		# overwrite rows carried over under the same name.
		dwLen = dim(DW)[2]
		DWTemp <- matrix(0,nrow=rLen,ncol=dwLen)
		rownames(DWTemp) <- allrows
		DWTemp[rownames(DWAll),] = DWAll
		DWTemp[rownames(DW),] = DW
		rownames(DWTemp) <- allrows
		DWAll = DWTemp
	
	}
	##Takes summed congress speech counts and orders by word frequency.
	# Covariates: party coded -1 for "D", +1 for "R", 0 otherwise; South is
	# TRUE for the border-South and deep-South states listed below.
	DWUnionParty<-(DWAll[,2]=="D")*-1 +(DWAll[,2]=="R")
	countsSummed = as.simple_triplet_matrix(as(countsAll,"dgTMatrix"))
	DWBorderSouth<-(DWAll[,3]=="DE" | DWAll[,3] =="MD" | DWAll[,3]=="VA" | DWAll[,3]=="WV" | DWAll[,3] =="KY")
	DWDeepSouth<-(DWAll[,3]=="NC" | DWAll[,3] =="TN" | DWAll[,3]=="SC" | DWAll[,3]=="GA" | DWAll[,3] =="FL"| DWAll[,3]=="MS" | DWAll[,3] =="TX"| DWAll[,3] =="OK"| DWAll[,3]=="AR" | DWAll[,3] =="LA") 
	DWSouth=DWBorderSouth|DWDeepSouth
	# Target matrix, one row per legislator:
	# columns 1-2 = DWAll[,6:7], column 3 = South (0/1), column 4 = party.
	DWUnion = matrix(c(as.double(DWAll[,6:7]),DWSouth,DWUnionParty),rLen,4)

	#Eliminated WCO ordering; want to preserve chi2 order
	#DWUnionParty = matrix(as.double(c(DWUnionParty),DWSouth),rLen,2)
	#rownames(DWUnionParty)<-allrows
	rownames(DWUnion)<-allrows
	rownames(countsSummed)<-allrows
	colnames(countsSummed)<-vocabSub
	# Total count per word. wcOrder is the identity permutation, so the "WCO"
	# objects keep the incoming column order.
	wc = as.matrix(col_sums(countsSummed))
	rownames(wc) <- vocabSub
	#wcOrder = as.matrix(order(wc[,1]))
	wcOrder = 1:sLen
	wcInOrder = as.matrix(wc[wcOrder])
	vocabWCO <- vocabSub[wcOrder]
	rownames(wcInOrder) <- vocabWCO
	vocShort <- as.matrix(vocabWCO[(sLen-vLimit+1):sLen])
	countsWCO = countsSummed[,wcOrder]
	rownames(countsWCO)<-allrows
	colnames(countsWCO)<-vocabWCO
	lNum = dim(DWAll)[1]
	
	#Uncomment this set of lines for TrimmingWithGrep
	# Keep columns whose name contains gramSize lower-case words joined by ".".
	# NOTE(review): the pattern is not anchored, so a name only has to contain
	# such a substring; names with digits elsewhere can still pass -- confirm
	# this is the intended "trim numbers" behaviour.
	lettersRe = ""
	if(gramSize == 1){lettersRe = "[a-z]+"}
	if(gramSize == 2){lettersRe = "[a-z]+\\.[a-z]+"}
	if(gramSize == 3){lettersRe = "[a-z]+\\.[a-z]+\\.[a-z]+"}
	indicesTrimmed <- grep(lettersRe,colnames(countsWCO))
	sLenTrimmed = length(indicesTrimmed)
	wcTrimmed <- wc[indicesTrimmed]
	vocabWCOTrimmed <- vocabWCO[indicesTrimmed]
	vocShortTrimmed <- as.matrix(vocabWCOTrimmed[(sLenTrimmed-vLimit+1):sLenTrimmed])
	countsWCOTrimmed = countsWCO[,indicesTrimmed]
	# Only the last vLimit trimmed columns enter the analysis. normalize() comes
	# from one of the attached packages (som and textir are both loaded) --
	# confirm which one is in effect and whether it standardises rows or columns.
	countsNormedTrimmed = normalize(as.matrix(countsWCOTrimmed[,(sLenTrimmed-vLimit+1):sLenTrimmed]))/sqrt(lNum)
	dwNormed = normalize(as.matrix(DWUnion))/sqrt(lNum)
	#dwNormedParty = normalize(as.matrix(DWUnionParty))/sqrt(lNum)
	# Loadings (words x 4 covariates) and the fitted values they imply for
	# every legislator.
	correl = t(countsNormedTrimmed)%*%dwNormed
	#correlP = t(countsNormedTrimmed)%*%dwNormedParty
	allPred = countsNormedTrimmed%*%correl
	# Classification rules: South when the column-3 prediction exceeds 0.3;
	# party is the sign of the column-4 prediction.
	allPredSouth = (allPred[,3]>0.3)*1
	allPredParty = ((allPred[,4])>0)*1+(allPred[,4]<0)*-1
	allPredWClass = cbind(allPred,allPredSouth,allPredParty)
	# NOTE(review): paste() with its default sep = " " puts spaces into the file
	# name (e.g. "rawdata\Routput\cong 43 _correlPredictions Tri .csv"). Left
	# unchanged here and below because other code may rely on these names.
	write.csv(allPredWClass,paste("rawdata\\Routput\\cong",congNo,"_correlPredictions",gramName,".csv"))
	
	k = 1
	maxK = 1
	#two validations (in-sample and out of sample), one on full sample, one on 75% randomly sampled training set
	valAvg = matrix(0,nrow=6,ncol=4)
	#K-fold cross-validation
	# Implemented as maxK repetitions of a random 75%/25% split whose scores are
	# averaged after the loop.

	for(k in 1:maxK){	
	seed<-3 #different than seed used in paper; paper seed not saved, find .csv files for replicating that run in Routput_replicate.zip
	# Offset the seed by the repetition index so each repetition draws a
	# different split; the first repetition still uses `seed` itself.
	set.seed(seed + k - 1)
	train = sample(lNum,size=lNum*.75)

	# Naming: a "train" suffix means the loadings were estimated on the training
	# rows only; without it they were estimated on all rows.
	correlValtrain = t(countsNormedTrimmed[train,])%*%dwNormed[train,]
	correlVal = t(countsNormedTrimmed)%*%dwNormed
	#correlVal[is.na(correlVal)]<-0

	# In-sample scores for the training-row loadings: one agreement score per
	# covariate column, then the share of correct South and party labels.
	selfPredtrain = countsNormedTrimmed[train,]%*%correlValtrain
	selfPredCorrtrain = colSums(normalize(selfPredtrain)*normalize(dwNormed[train,]))/dim(countsNormedTrimmed[train,])[1]
	selfPredSouthtrain = ((selfPredtrain[,3])>0.3)*1
	selfPredSouthClasstrain = sum(selfPredSouthtrain == DWUnion[train,3])/dim(DWUnion[train,])[1]
	selfPredPartytrain = ((selfPredtrain[,4])>0)*1+(selfPredtrain[,4]<0)*-1
	selfPredPartyClasstrain = sum(selfPredPartytrain == DWUnion[train,4])/dim(DWUnion[train,])[1]
	selfPredCorrWClasstrain = c(selfPredCorrtrain,selfPredSouthClasstrain,selfPredPartyClasstrain)	

	# In-sample scores for the all-row loadings.
	selfPred = countsNormedTrimmed%*%correlVal
	selfPredCorr = colSums(normalize(selfPred)*normalize(dwNormed))/dim(countsNormedTrimmed)[1]
	selfPredSouth = ((selfPred[,3])>0.3)*1
	selfPredSouthClass = sum(selfPredSouth == DWUnion[,3])/dim(DWUnion)[1]
	selfPredParty = ((selfPred[,4])>0)*1+(selfPred[,4]<0)*-1
	selfPredPartyClass = sum(selfPredParty == DWUnion[,4])/dim(DWUnion)[1]
	selfPredCorrWClass = c(selfPredCorr,selfPredSouthClass,selfPredPartyClass)

	# Held-out rows scored with the all-row loadings (which were estimated on
	# data that includes those rows).
	valPred = countsNormedTrimmed[-train,]%*%correlVal
	valPredCorr = colSums(normalize(valPred)*normalize(dwNormed[-train,]))/dim(countsNormedTrimmed[-train,])[1]	
	valPredSouth = (valPred[,3]>0.3)*1
	valPredSouthClass = sum(valPredSouth == DWUnion[-train,3])/dim(DWUnion[-train,])[1]
	valPredParty = (valPred[,4]>0)*1+(valPred[,4]<0)*-1
	valPredPartyClass = sum(valPredParty == DWUnion[-train,4])/dim(DWUnion[-train,])[1]
	valPredCorrWClass = c(valPredCorr,valPredSouthClass,valPredPartyClass)

	# Held-out rows scored with the training-row loadings (true out-of-sample).
	valPredtrain = countsNormedTrimmed[-train,]%*%correlValtrain
	valPredCorrtrain = colSums(normalize(valPredtrain)*normalize(dwNormed[-train,]))/dim(countsNormedTrimmed[-train,])[1]	
	valPredSouthtrain = (valPredtrain[,3]>0.3)*1
	valPredSouthClasstrain = sum(valPredSouthtrain == DWUnion[-train,3])/dim(DWUnion[-train,])[1]
	valPredPartytrain = (valPredtrain[,4]>0)*1+(valPredtrain[,4]<0)*-1
	valPredPartyClasstrain = sum(valPredPartytrain == DWUnion[-train,4])/dim(DWUnion[-train,])[1]
	valPredCorrWClasstrain = c(valPredCorrtrain,valPredSouthClasstrain,valPredPartyClasstrain)
	#Rename: Untrained is on 100% sample, trained is on 75% sample, and should be the default
	selfPredCorrWClassUntrained<-	selfPredCorrWClass
	valPredCorrWClassUntrained<- valPredCorrWClass
	selfPredCorrWClass<-	selfPredCorrWClasstrain
	valPredCorrWClass<-valPredCorrWClasstrain
	# 6 x 4 score table: rows = 4 covariate scores + South and party accuracy;
	# columns = trained in-sample, trained held-out, untrained in-sample,
	# untrained held-out.
	validation = cbind(selfPredCorrWClass,valPredCorrWClass,selfPredCorrWClassUntrained,valPredCorrWClassUntrained)
	if(k==1){
		# Written once per pass: the full-sample loadings plus, for each word,
		# its total count and a value looked up in column 3 of googSub.
		wcShort = as.matrix(wcTrimmed[(sLenTrimmed-vLimit+1):sLenTrimmed])
		# NOTE(review): googSub is indexed by word here, which only works if its
		# row names are the n-grams -- confirm, otherwise this column is NA.
		googShort = as.matrix(googSub[as.matrix(vocShortTrimmed),3])
		correl = cbind(correl,wcShort)
		correl = cbind(correl,googShort)
		#correlP = cbind(correlP,wcShort)
		rownames(correl) <- vocabWCOTrimmed[(sLenTrimmed-vLimit+1):sLenTrimmed]
		#rownames(correlP) <- vocabWCOTrimmed[(sLenTrimmed-vLimit+1):sLenTrimmed]
		write.csv(correl,paste("rawdata\\Routput\\cong",congNo,"_correlationswithclassification",gramName,".csv"))
	}


	#vocHash = hash(colnames(countsWCOTrimmed),(1:sLenTrimmed))
	#regrInd = values(vocHash,rownames(correlVal))
	#regrCounts = normalize(as.matrix(countsWCOTrimmed[,regrInd]))
	#predsCong = as.matrix(regrCounts)%*%as.matrix(correlVal)
	#valPredCorr = colSums(normalize(predsCong)*normalize(dwNormed))/dim(regrCounts)[1]	
	#valPredCorr

	
	# Accumulate the score tables so the division by maxK below is a real
	# average; the first repetition seeds the sum and fixes its dimnames.
	if(k==1){
		valAvg = as.matrix(validation)
	}else{
		valAvg = valAvg + as.matrix(validation)
	}
	}	
	valAvg=valAvg/maxK
	write.csv(valAvg,paste("rawdata\\Routput\\cong",congNo,"_validationswithclassification",gramName,".csv"))
	# Advance the window to the next congress.
	congStart <-congStart+congInt
	congEnd =min(congStart+congInt-1, 111)


}