	/* Session setup: start from an empty session, switch off output paging and
	   request memory (-set mem- is only needed by older Stata releases). */
	clear all
	set more off
	set mem 5g

	/* Number of the last Congress handled by the loops below. */
	global ncong 111

	/* Move into the directory that holds the R output. */
	*cd "D:\Data\CongressionalRecord\ReplicationFinalRCode\factorloadings"
	cd "rawdata"

	/* Reserve two temporary file names per Congress, temp<N> and tempcu<N>,
	   for N running from 43 up to the value of the global ncong. */
	forvalues c = 43/$ncong {
		tempfile temp`c' tempcu`c'
	}

/****Uncomment if using fresh R-generated files
	insheet using "numpersons.csv", clear
	drop v1
	rename v2 congress
	rename v3 numperson
	duplicates drop
	sort congress numperson
	save "numberpersonfile.dta", replace
****/

/*To use freshly R-generated files, replace the Routput_replication directory below with Routput*/
	/* For every Congress: read the trigram correlation file, flag the phrases at
	   the two extremes of the party correlation, attach the number of persons
	   recorded for that Congress, compute a standard error of the correlation
	   and store the result in that Congress's tempcu temporary file. */
	forvalues y = 43/$ncong {
		/* Forward slashes are understood by Stata on every operating system. */
		insheet using "Routput_replication/cong `y' _correlationswithclassification Tri .csv", names clear

		rename v1 phrase_name
		rename v2 plscorr1_trimmed
		rename v3 plscorr2_trimmed
		rename v4 plscorrdwSouth_trimmed
		rename v5 plscorrparty_trimmed
		rename v6 freq
		rename v7 googlefreq

		/* Flag phrases within .00001 of the minimum / maximum party correlation. */
		summarize plscorrparty_trimmed
		gen mostdem = plscorrparty_trimmed < `r(min)' + .00001
		/* Stata treats missing as larger than any number, so a missing
		   correlation must be excluded explicitly or it would be flagged here. */
		gen mostrep = (plscorrparty_trimmed > `r(max)' - .00001) & !missing(plscorrparty_trimmed)

		gen congress = `y'
		/*get number of congressmen used to calculate correlations*/
		/* keep(match) retains only rows found in both files; nogenerate skips _merge. */
		merge m:1 congress using "numberpersonfile.dta", keep(match) nogenerate
		/* Standard error of a correlation r with n persons: sqrt((1 - r^2)/(n - 2)). */
		gen std_error = sqrt((1-plscorrparty_trimmed*plscorrparty_trimmed)/(numperson-2) )
		save `tempcu`y'', replace
	}
	/* Stack the per-Congress temporary files into a single dataset, beginning
	   with Congress 43 and appending every later one in turn. */
	use `tempcu43', clear
	forvalues c = 44/$ncong {
		append using `tempcu`c''
	}
	sort phrase_name congress

	/* Indicators describing the shape of each phrase name.
	   NOTE(review): inside square brackets the | is a literal character, not
	   alternation, so the last two patterns also accept a vertical bar. */
	/* contains three runs of lower-case letters joined by periods */
	generate not_has_number_as_word = regex(phrase_name, "[a-z]+\.[a-z]+\.[a-z]+")
	/* made up solely of lower-case letters, periods (and |) */
	generate has_only_alphacharacters = regex(phrase_name, "^[a-z|\.]+$")
	/* made up solely of lower-case letters, digits, periods (and |) */
	generate has_only_characters_and_numbers = regex(phrase_name, "^[a-z0-9|\.]+$")

	/* Write the combined loadings one directory above the working directory. */
	save "../TriGramLoadings_withvalidations.dta", replace
	/******generate Table 1 and Table 2********/
	/* A stable sort keeps the current row order within each Congress, so the
	   Table 1 listing comes out in the same order on every run. */
	sort congress, stable
	/* Write to a named log: a log left open by an interrupted earlier run (or
	   opened by the user) then no longer makes -log using- fail. */
	capture log close tables
	log using "../Pics_Tables/table1table2.log", replace name(tables)
	/*table 1*/
	/* Standard errors in paper are incorrect; this generates correct standard errors for Table 1 and 2*/
	list congress phrase_name plscorrparty_trimmed std_error* if mostdem==1 | mostrep==1
	/* Table 2 is restricted to Congress 110. */
	keep if congress==110
	/*table 2*/
	/* phrase_name is a secondary key so that tied correlations are ordered
	   deterministically and the 50-row cut-off is reproducible. */
	/* 50 lowest party correlations */
	gsort plscorrparty_trimmed phrase_name
	list  phrase_name plscorrparty_trimmed std_error if _n<=50
	/* 50 highest party correlations */
	gsort -plscorrparty_trimmed phrase_name
	list  phrase_name plscorrparty_trimmed std_error if _n<=50
	log close tables

	
	/***To replicate what is in paper***/
	clear all
	/*get validation numbers for pics*/
	/* Loop over the same Congress range for which temporary files were reserved
	   (43 up to the global ncong) instead of a hard-coded upper bound. */
	forvalues k = 43/$ncong {
		/* Forward slashes are understood by Stata on every operating system. */
		insheet using "Routput_replication/cong `k' _validationswithclassification Tri .csv"
		/*these are ones that predict correlation with party using training data*/
		keep selfpredcorrwclass valpredcorrwclass v1
		/* NOTE(review): keeps only rows whose v1 exceeds 5 (missing v1 also passes);
		   the meaning of v1 is not visible in this file - confirm against the R output. */
		keep if v1 >5
		g congress=`k'
		save `temp`k''
		clear
	}
	/* Stack the per-Congress validation files into one dataset. */
	use `temp43'
	forvalues k = 44/$ncong {
		append using `temp`k''
	}
	/* Variable labels; these become the legend entries of the scatter plot. */
	label var valpredcorrwclass "Out of Sample Percent Predicted Correct"
	label var selfpredcorrwclass "In Sample Percent Predicted Correct"

	/* Map Congress number to a year: Congress 111 gives 2009 and each earlier
	   Congress is two years before. The full variable name is used so the line
	   does not depend on variable abbreviation being enabled. */
	g Year=2011-(112-congress)*2

	/* In-sample and out-of-sample accuracy by year, for years after 1875. */
	scatter selfpredcorrwclass valpredcorrwclass Year if Year>1875, ///
		ytitle(Percentage Correct) ///
		title(Percentage of House Members Predicted Correctly) ///
		subtitle(by Year) ///
		xscale(range(1870 2008)) xmtick(1870(10)2008) ///
		note(% of Political Party Affiliations Correctly Predicted by Frequency-Weighted Partisanship Score) ///
		msymbol(Oh O) legend(rows(2))
	/* Forward slashes keep these paths valid on every operating system and
	   consistent with the other output paths in this file. */
	graph save "../Pics_Tables/pct_correctly_predicted", replace
	graph export "../Pics_Tables/pct_correctly_predicted.png", replace
	graph export "../Pics_Tables/pct_correctly_predicted.eps", replace
	