

* ---------------------------------------------------------------------------
* Session setup.
* ---------------------------------------------------------------------------
* Start from an empty session so nothing left in memory leaks into this run.
clear all
* Request 5 GB of data memory.
* NOTE(review): only needed on Stata releases with a fixed memory allocation;
* newer releases manage memory automatically and presumably just ignore
* this line -- confirm against the Stata version used for replication.
set mem 5g
* Never pause output with a --more-- prompt, so the do-file runs unattended.
set more off
* All file paths used below are relative. To run from a fixed location,
* uncomment the two lines below and adjust the path.
*global path "D:\Data\CongressionalRecord"
*cd "$path\replication"


**** Create Google Phrase Data Set for Persistently Partisan Phrases and Frequent Phrases ****
* Load the raw Google scrape. The columns are referenced as v1-v3 and given
* meaningful names here.
insheet using "rawdata\googleYears3USscrape_all10K_sept_25_2012.csv", clear
rename v1 phrase_name
rename v2 year
rename v3 googlecounts

* Restrict to 1873-2009 and map every calendar year to a two-year congress
* number: 1873 and 1874 -> 43, 1875 and 1876 -> 44, ..., 2009 -> 111.
keep if year>1872 & year<2010
generate congress = trunc((year-1861)/2) + 37
drop year

* One row per phrase x congress, holding the summed counts.
collapse (sum) googlecounts, by(phrase_name congress)

* Share of each phrase in the total count of all phrases in that congress.
bysort congress: egen totgoogcounts = sum(googlecounts)
generate googlerelfreq = googlecounts/totgoogcounts

* Variable names are written out in full (previously "congres"), so the
* script no longer depends on variable-name abbreviation being enabled.
* The merge below uses the old (pre-Stata 11) match-merge syntax, which needs
* the data in memory sorted on the key variables -- hence the sort.
* NOTE(review): the using file is presumably already sorted on the same keys
* and unique on them; confirm before converting this to "merge 1:1".
sort congress phrase_name
merge congress phrase_name using "rawdata\google2-3renamed.dta"
* Drop rows that exist only in the using file (_merge==2). _merge itself is
* left in the data.
drop if _merge==2
**** Merge with Congressional Record Data ****
* One-to-one match on congress x phrase. The match result is stored in
* merge3freq. The key is written out in full (previously "congres"), so the
* command no longer depends on variable-name abbreviation being enabled.
merge 1:1 congress phrase_name using "TriGramLoadings_withvalidations.dta", gen(merge3freq)
gen trigram = 1

* A variable called googlefreq already exists at this point (it is not
* created in this script, so it comes in through one of the merged files).
* Keep it under a new name and reuse "googlefreq" for the within-congress
* relative frequency computed from the scrape.
rename googlefreq overallgooglefreq
gen googlefreq = googlerelfreq

* Calendar year associated with each congress (congress 112 -> 2011,
* congress 43 -> 1873) and the decade that year falls in.
gen year = 2011 - (112-congress)*2
gen decade = trunc(year/10)*10

* Relative frequency: a missing freq is treated as zero occurrences, then
* each phrase's freq is divided by the total over all phrases in that year.
replace freq = 0 if freq==.
bys year: egen sumfreq = sum(freq)
gen relfreq = freq/sumfreq

* "Gap" counts: googlecounts_all minus googlecounts, floored at zero, and the
* share of each phrase in the per-congress total of those gaps.
* Rows where either count is missing keep a missing gap (missing is not < 0).
gen googlecountgap = googlecounts_all - googlecounts
replace googlecountgap = 0 if googlecountgap<0
egen totgap = sum(googlecountgap), by(congress)
gen gapfreq = googlecountgap/totgap

* Declare a phrase x congress panel so the lag operators L., L2., ... refer
* to the same phrase in the previous congress(es).
egen phraseno = group(phrase_name)
tsset phraseno congress

* dempersist_t = 1 when the score is negative in this congress and in each of
* the four preceding congresses. A missing score (including a lag that does
* not exist) fails the "< 0" test, because Stata treats missing as larger
* than any number, so incomplete five-congress windows are never flagged.
gen dempersist_t = (plscorrparty_trimmed<0 & L.plscorrparty_trimmed<0 & L2.plscorrparty_trimmed<0 & L3.plscorrparty_trimmed<0 & L4.plscorrparty_trimmed<0)

* 0/1 indicator for a strictly positive, non-missing score.
gen plscorrparty_trimmed_repub = plscorrparty_trimmed>0 & plscorrparty_trimmed!=.

* repubpersist_t = 1 when the indicator is 1 in this congress and in each of
* the four preceding congresses.
* The comparison is "== 1", not "> 0": a lag that does not exist evaluates to
* missing, and "missing > 0" is TRUE in Stata. With "> 0" a phrase was
* flagged at the start of its series (or after a gap) without five observed
* congresses, unlike the Democratic flag above. "== 1" is false for missing,
* which makes the two flags symmetric.
* NOTE(review): this changes which phrases count as persistently Republican
* relative to the previous version; re-check downstream results.
gen repubpersist_t = (plscorrparty_trimmed_repub==1 & L.plscorrparty_trimmed_repub==1 & L2.plscorrparty_trimmed_repub==1 & L3.plscorrparty_trimmed_repub==1 & L4.plscorrparty_trimmed_repub==1)

* Phrase-level flags: 1 if the phrase was ever flagged in any congress.
egen dempersist = max(dempersist_t==1), by(phrase_name)
egen repubpersist = max(repubpersist_t==1), by(phrase_name)
gen partisanpersist = dempersist|repubpersist

* Store the phrase x congress panel used by the time-series section.
save phrase23_panel_replication.dta, replace

* Reload the phrase x congress panel saved above, so this section can also be
* run on its own.
use phrase23_panel_replication.dta, clear

**************** Generate various time series **************
* Each egen below writes a per-year constant onto every phrase row; the
* collapse at the end then keeps one row per year.
* Keep the statements in this order: the collapse selects variables by
* wildcard, so names and creation order determine the output file.

* Partisanship (signed score).
* NOTE(review): despite its name, congparti_mean is the unweighted SUM of the
* score over phrases, not a mean -- confirm this is intended.
egen congparti_mean = sum(plscorrparty_trimmed), by(year)
egen congparti      = sum(relfreq*plscorrparty_trimmed), by(year)

egen googleparti            = sum(googlefreq*plscorrparty_trimmed), by(year)
* The "_persistent" series use only phrases with partisanpersist==1; other
* rows are left missing and are ignored by the mean in the collapse.
egen congparti_persistent   = sum(relfreq*plscorrparty_trimmed)    if partisanpersist==1, by(year)
egen googleparti_persistent = sum(googlefreq*plscorrparty_trimmed) if partisanpersist==1, by(year)

* Polarization (absolute score).
egen congpol   = sum(relfreq*abs(plscorrparty_trimmed)), by(year)
egen googlepol = sum(googlefreq*abs(plscorrparty_trimmed)), by(year)

* Same two measures weighted by the gap-count shares (gapfreq).
egen googleparti_all = sum(gapfreq*plscorrparty_trimmed), by(year)
egen googlepol_all   = sum(gapfreq*abs(plscorrparty_trimmed)), by(year)

egen congpol_persistent   = sum(relfreq*abs(plscorrparty_trimmed))    if partisanpersist==1, by(year)
egen googlepol_persistent = sum(googlefreq*abs(plscorrparty_trimmed)) if partisanpersist==1, by(year)

* Reduce to one row per year/congress.
* NOTE(review): the *parti* wildcard also matches partisanpersist, so its
* per-year mean is carried into the output file -- confirm this is wanted.
collapse (mean) *parti* *pol*, by(year congress)
bysort year: summarize
sort year
save pol23_time_series_replication.dta, replace
