*************************************************************
**This code creates the dataset and removes respondents whose**
***survey is incomplete or who failed the attention checks***
*************************************************************
* ---------------------------------------------------------------------------
* Paths. Uncomment the next line and set it to the folder that contains
* replication_package/ before running; survey_data is built from it.
* ---------------------------------------------------------------------------
*global dir " " //set origin path
global survey_data "$dir/replication_package/data"

* Load the raw survey export; the first spreadsheet row supplies the variable names
import excel using "$survey_data/BPEA_survey_B_1.xlsx", clear firstrow

* Discard the first data row
* NOTE(review): presumably the question-text row of the export - confirm
drop in 1

* Dropping incompletes: only observations with screen equal to "1" are retained
drop if screen != "1"

* ---------------------------------------------------------------
* Time spent on survey
* ---------------------------------------------------------------
* Convert the recorded duration to numeric (force: non-numeric entries
* become missing) and express it in minutes.
destring Durationinseconds, force replace
generate Durationinminutes = Durationinseconds / 60
label variable Durationinminutes "Minutes spent on survey"

* ---------------------------------------------------------------
* Recoding timers
* ---------------------------------------------------------------
* Same forced numeric conversion for every variable whose name starts with T
destring T*, force replace

** Date
* Parse the survey start timestamp into a Stata datetime (%tc).
* %tc values are milliseconds since 01jan1960 and must be stored as double:
* the default float type cannot hold them exactly, so the stored time would be
* rounded by up to about a minute and starts close to midnight could be
* assigned to the wrong calendar day by dofc() below.
gen double date_start = clock(StartDate, "MDY hm")
format date_start %tc
* Calendar day on which the survey was started
gen dailydate = dofc(date_start)
format dailydate %td


*********************************************************
*********************************************************
**# *********** REMOVING INATTENTIVES *******************
*********************************************************
*********************************************************

* Collection-wave indicators based on the survey start day.
* The cutoffs are cumulative:
*   collection_i  = 1 for surveys started before 2 Dec 2023
*   collection_ii = 1 for surveys started before 25 Jan 2024 (this includes
*                   every observation that has collection_i equal to 1)
* A missing dailydate compares as larger than any date, so both are 0 for it.
generate collection_i  = dailydate < td(02dec2023)
generate collection_ii = dailydate < td(25jan2024)



* ---------------------------------------------------------------------------
* Find and remove respondents inattentive to open-ended questions (bots)
* ---------------------------------------------------------------------------

* Flag is 0 for everyone and is switched to 1 below for the listed ResponseIds
generate inatt_open = 0

* How the ID lists below were built (steps 1 and 2 stay commented out in the final run):

*STEP 1: keep only the open-ended questions and one collection day for manual review (THESE MUST BE COMMENTED OUT IN LAST STEP)
*keep inf_def_open_1 inf_info_int_why inf_info_bad_what inf_info_high_why inf_info_positive inf_yard_why_true inf_yard_why_false pers_imp_me pers_imp_feelings pers_imp_angry pers_imp_angry_who pers_imp_inc_change
*keep if dailydate == date("19oct2023", "DMY")

*STEP 2: collect the ResponseId of the bad rows (by observation number) and print the list
/*
foreach j in  47 88 98 114 127 145 163 164 167 {
local rid_`j' = ResponseId[`j']
local list "`list' `rid_`j''"
}
di "`list'"
*/

*STEP 3: paste the printed list into the globals below

global list_remove_1 " R_1dzwyuYfhag6vp7 R_2f0XEddpX3Cv3P9 R_2bHHP7i2L53NXNK R_3R4ngKPPN0aQCyr R_OPTycE9KBakMNJn R_3h3VZi640RF9nC3 R_3KUCNDiTSMdS0le R_2cuB8FqbXzJOBkH R_bE2SOG3gqpLM6aJ"

* IDs from the January 24th wave
global list_remove_2 "R_6aOXUHa0tV8D6T2 R_6dTaxyBxq8I7UxA R_7CUXgbf4xih1K2s R_391AsUpTjwk5nnX R_52AbQtxt4y5AZat R_5sYamGg8r24dp3b R_67OPqXd4pQj2dnS R_7IQ8uv5ZxGW5XLH R_6tGRhMFklr93DDr R_7i5uYphsGAKKP0Z"

*STEP 4: set the flag to 1 for every ResponseId in either list

foreach id_list in list_remove_1 list_remove_2 {
	foreach rid of global `id_list' {
		replace inatt_open = 1 if ResponseId == "`rid'"
	}
}



*********************** Inaccuracy variables ************************

* Check whether respondents report a different gender in the pre-screening
* questions asked by Lucid (gender_lucid) and in our survey question (gender).
*** Lucid - Qualtrics ***
destring gender_lucid gender, replace
gen test_gender = gender - gender_lucid

* inacc_gender = 1 when the two codes differ by exactly one; 0 otherwise
* (including when either code is missing)
gen inacc_gender = 0 //INACC. divergence gender btw Lucid and Qualtrics
label var inacc_gender "Divergence gender btw Lucid and Qualtrics"
replace inacc_gender = 1 if abs(test_gender) == 1

* Show how many respondents are flagged before anything is dropped
tab inacc_gender

**************drop inattentive respondents***************

* Respondents flagged on the open-ended questions
drop if inatt_open == 1
* Gender mismatches, except when the survey answer is 3 (labelled "Other" further down)
drop if (inacc_gender == 1 & gender != 3)

*********************************************************
*********************************************************
**# *********** ENCODING VARIABLES **********************
*********************************************************
*********************************************************

****COMMON LABELS
* Shared Yes/No value label, reused by the binary questions further down
label define yes_no ///
	1 "Yes" ///
	2 "No"

********************************************
******* Background questions - QUOTAS ******
********************************************

* Gender: gender_en is a labelled copy of gender
generate gender_en = gender
label define gender ///
	1 "Male" ///
	2 "Female" ///
	3 "Other"
label values gender_en gender
label variable gender_en "Gender"

*age
* Numeric age built from the raw age_1 answer
gen age = age_1
destring age, replace

*age_group: Qualtrics and respondents
* The existing age_group is kept under a new name (labelled as the Qualtrics
* one); age_group is then rebuilt below from the reported age.
rename age_group age_group_q
label var age_group_q "Age group qualtrics"

* The helper string starts empty: encode turns an empty string into missing,
* so a missing age or an age outside 18-69 yields a missing age_group.
* Seeding it with "." instead would make encode append "." to the value label
* as a sixth category.
gen 			agegroup=""
replace 		agegroup="18-29" if(age>17 & age<30)
replace 		agegroup="30-39" if(age>29 & age<40)
replace 		agegroup="40-49" if(age>39 & age<50)
replace 		agegroup="50-59" if(age>49 & age<60)
replace 		agegroup="60-69" if(age>59 & age<70)
* Predefine the label so the numeric codes follow the age order
label define 	agegroup 1 "18-29" 2 "30-39" 3 "40-49" 4 "50-59" 5 "60-69"
encode			agegroup, gen(age_group) label(agegroup)
drop 			agegroup

* Residence and race: convert the three answers to numeric in one pass,
* then attach value and variable labels.
destring live_in_us us_area race, replace

* Live in the US (Yes/No)
label variable live_in_us "Live in the US"
label values live_in_us yes_no

* Area of living
label define area_us ///
	1 "Northeast" ///
	2 "South" ///
	3 "Midwest" ///
	4 "West"
label variable us_area "Living area US"
label values us_area area_us

* Race
label define race ///
	1 "White" ///
	2 "African American/Black" ///
	3 "Hispanic/Latino" ///
	4 "Asian/Asian American" ///
	5 "Mixed race" ///
	6 "Other"
label variable race "Race"
label values race race

*income
* The existing income_group is kept under a new name (labelled as the
* Qualtrics one); income_group is then rebuilt below from income_bracket.
rename income_group income_group_q
label var income_group_q "Qualtrics income group"

* Collapse the 14 brackets (still stored as strings at this point) into 6 groups.
* The helper string starts empty: encode turns an empty string into missing,
* so an unanswered or unexpected bracket yields a missing income_group.
* Seeding it with "." instead would make encode append "." to the value label
* as a seventh category.
gen incomegroup=""
replace incomegroup="$0-$19999" if inlist(income_bracket, "1", "2", "3")
replace incomegroup="$20000-$39999" if inlist(income_bracket, "4", "5")
replace incomegroup="$40000-$69999" if inlist(income_bracket, "6", "7")
replace incomegroup="$70000-$99999" if inlist(income_bracket, "8", "9")
replace incomegroup="$100000-$124999" if inlist(income_bracket, "10", "11")
replace incomegroup="$125000+" if inlist(income_bracket, "12", "13", "14")

* Predefine the label so the numeric codes follow the income order
label define incomegroup 1 "$0-$19999" 2 "$20000-$39999" 3 "$40000-$69999" 4 "$70000-$99999" 5 "$100000-$124999" 6 "$125000+"
encode incomegroup, gen(income_group) label(incomegroup)
drop incomegroup
label var income_group "Income group"

*for a less coarse partition 
* NOTE(review): the text of label 5 is written without dollar signs, unlike the
* other brackets; left unchanged here.
label define income_bracket 1 "$0-$9999" 2 "$10000-$14999" 3 "$15000-$19999" 4 "$20000-$29999" 5 "30000-39999" 6 "$40000-$49999" 7 "$50000-$69999" 8 "$70000-$89999" 9 "$90000-$99999" 10 "$100000-$109999" 11 "$110000-$124999" 12 "$125000-$149999" 13 "$150000-$199999" 14 "$200000+"
destring income_bracket, replace
label values income_bracket income_bracket

************************************
************ Demographics **********
************************************

* Born in the US (Yes/No)
destring born_us, replace
label variable born_us "Born in the US"
label values born_us yes_no

*zip_code
*already encoded in 1.0creating_dataset.do

* Number of children
destring children, replace
label define nb_child ///
	1 "I do not have children" ///
	2 "1" ///
	3 "2" ///
	4 "3" ///
	5 "4" ///
	6 "5 or more"
label variable children "N. of children"
label values children nb_child

* Highest level of education: education_level is a numeric, labelled copy of educ
destring educ, generate(education_level)
label define education_level ///
	1 "Primary edu or less" ///
	2 "Some High School" ///
	3 "High School degree/ GED" ///
	4 "Some College" ///
	5 "2 year College Degree" ///
	6 "4 year College Degree" ///
	7 "Masters Degree" ///
	8 "Doctoral Degree" ///
	9 "Professional Degree (JD, MD, MBA)"
label variable education_level "Education level"
label values education_level education_level


* Field of study: convert to numeric and attach the 79-category value label
destring field_of_study, replace
label define field_of_study ///
	1 "Accounting/bookkeeping" ///
	2 "Administrative science/public administration" ///
	3 "Advertising" ///
	4 "Agriculture/horticulture" ///
	5 "Allied health" ///
	6 "Anthropology" ///
	7 "Architecture" ///
	8 "Art" ///
	9 "Aviation/aeronatics" ///
	10 "Biology" ///
	11 "Business administration" ///
	12 "Chemistry" ///
	13 "Child/human/family development" ///
	14 "Comm. disorders" ///
	15 "Communications/speech" ///
	16 "Computer science" ///
	17 "Counseling" ///
	18 "Criminology/criminal justice" ///
	19 "Dance" ///
	20 "Dentistry" ///
	21 "Economics" ///
	22 "Education" ///
	23 "Educational administration" ///
	24 "Electronics" ///
	25 "Engineering" ///
	26 "English" ///
	27 "Environmental science/ecology" ///
	28 "Ethnic studies" ///
	29 "Fashion" ///
	30 "Finance" ///
	31 "Fine arts" ///
	32 "Food science/nutrition/culinary arts" ///
	33 "Foreign language" ///
	34 "Forestry" ///
	35 "General sciences" ///
	36 "General studies" ///
	37 "Geography" ///
	38 "Geology" ///
	39 "Gerontology" ///
	40 "Health" ///
	41 "History" ///
	42 "Home economics" ///
	43 "Human services/human resources" ///
	44 "Humanities" ///
	45 "Industrial relations" ///
	46 "Industry and technology" ///
	47 "Information technology" ///
	48 "Journalism" ///
	49 "Law" ///
	50 "Law enforcement" ///
	51 "Liberal arts" ///
	52 "Library science" ///
	53 "Marketing" ///
	54 "Mathematics" ///
	55 "Mechanics/machine trade" ///
	56 "Medicine" ///
	57 "Music" ///
	58 "Nursing" ///
	59 "Other vocational" ///
	60 "Parks and recreation" ///
	61 "Pharmacy" ///
	62 "Philosophy" ///
	63 "Physical education" ///
	64 "Physics" ///
	65 "Political science/international relations" ///
	66 "Psychology" ///
	67 "Public relations" ///
	68 "Social sciences" ///
	69 "Social work: Sociology" ///
	70 "Special education" ///
	71 "Statistics/biostatistics" ///
	72 "Television/film" ///
	73 "Textiles/cloth" ///
	74 "Theater arts" ///
	75 "Theology" ///
	76 "Urban and regional planning" ///
	77 "Veterinary medicine" ///
	78 "Visual arts/graphic design/ design and drafting" ///
	79 "Other"
label values field_of_study field_of_study
label variable field_of_study "Field of study"

* Current employment status
destring emp_status, replace
label define emp_status ///
	1 "Full-time employee" ///
	2 "Part-time employee" ///
	3 "Self-employed or business owner" ///
	4 "Unemployed and looking for work" ///
	5 "Student" ///
	6 "Not currently working and not looking for a job" ///
	7 "Retiree"
label variable emp_status "Employment status"
label values emp_status emp_status

* Main occupation: emp_occupation (if currently employed) and
* unemp_occupation (if currently unemployed). Both share the same 12
* categories; each variable gets a value label carrying its own name.
foreach occ in emp_occupation unemp_occupation {
	destring `occ', replace
	label define `occ' ///
		1 "Management, business and financial occupations" ///
		2 "Professional and related occupations" ///
		3 "Service occupations" ///
		4 "Sales and related occupations" ///
		5 "Office and administrative support occupations" ///
		6 "Farming, fishing and forestry occupations" ///
		7 "Construction and extraction occupations" ///
		8 "Installation, maintenance and repair occupations" ///
		9 "Production occupations" ///
		10 "Transportation and material moving occupations" ///
		11 "Armed forces" ///
		12 "Other"
	label values `occ' `occ'
}
label variable emp_occupation "Main occupation"
label variable unemp_occupation "Latest main occupation"

* Employment sector: emp_sector (if currently employed) and unemp_sector
* (if currently unemployed). Same 23 categories, one value label per variable.
foreach sec in emp_sector unemp_sector {
	destring `sec', replace
	label define `sec' ///
		1 "Agriculture, plantations, other rural sectors" ///
		2 "Basic metal production" ///
		3 "Chemical industries" ///
		4 "Commerce" ///
		5 "Construction" ///
		6 "Education" ///
		7 "Financial services, professional services" ///
		8 "Food, drink, tobacco" ///
		9 "Forestry, wood" ///
		10 "Health services" ///
		11 "Hotels, tourism, catering" ///
		12 "Mining" ///
		13 "Mechanical and electrical engineering" ///
		14 "Media, culture, graphical" ///
		15 "Oil and gas production, oil refining" ///
		16 "Postal and telecommunications services" ///
		17 "Public service" ///
		18 "Shipping, ports, fisheries, inland waterways" ///
		19 "Textiles, clothing, leather, footwear" ///
		20 "Transport (including civil aviation, railways, road transport)" ///
		21 "Transport equipment manufacturing" ///
		22 "Utilities (water, gas, electricity)" ///
		23 "Other"
	label values `sec' `sec'
}
label variable emp_sector "Employment sector"
label variable unemp_sector "Latest employment sector"

* Gig economy (Yes/No)
destring gig, replace
label variable gig "Gig economy"
label values gig yes_no

* Marital status
destring marital_status, replace
label define marital_status ///
	1 "Single" ///
	2 "Married" ///
	3 "Legally separated or divorced" ///
	4 "Widowed"
label variable marital_status "Marital status"
label values marital_status marital_status

* Spouse's employment status: reuses the emp_status value label
destring spouse_emp_status, replace
label variable spouse_emp_status "Spouse's employment status"
label values spouse_emp_status emp_status

* Income uncertainty: income_uncertain_en is a numeric, labelled copy on a 0-10 scale
destring income_uncertain, generate(income_uncertain_en)
label define certainty_scale ///
	0 "0 (Extremely uncertain)" ///
	1 "1" ///
	2 "2" ///
	3 "3" ///
	4 "4" ///
	5 "5" ///
	6 "6" ///
	7 "7" ///
	8 "8" ///
	9 "9" ///
	10 "10 (Extremely certain)"
label values income_uncertain_en certainty_scale

* Political questions: convert all five answers to numeric in one pass,
* then attach value and variable labels.
destring lib_scale pol_aff vote_2020 vote_who novote_who, replace

* Economic policy matters: liberal/conservative spectrum
label define lib_scale ///
	1 "Very liberal" ///
	2 "Liberal" ///
	3 "Moderate" ///
	4 "Conservative" ///
	5 "Very conservative"
label variable lib_scale "Economic ideology"
label values lib_scale lib_scale

* Political affiliation
label define pol_aff ///
	1 "Republican" ///
	2 "Democrat" ///
	3 "Independent" ///
	4 "Other" ///
	5 "Non-affiliated"
label variable pol_aff "Political affiliation"
label values pol_aff pol_aff

* Vote in 2020 presidential election? (Yes/No)
label variable vote_2020 "Vote 2020"
label values vote_2020 yes_no

* 2020 preferred candidate: vote_who (if they voted) and novote_who
* (if they did not vote) share the vote_who value label
label define vote_who ///
	1 "Joe Biden" ///
	2 "Donald Trump" ///
	3 "Howie Hawkins" ///
	4 "Jo Jorgensen" ///
	5 "Other"
label variable vote_who "Preferred candidate (voting)"
label values vote_who vote_who
label variable novote_who "Preferred candidate (not voting)"
label values novote_who vote_who



*************************************************
** Economic Information about the household *****
*************************************************

* Every question in this section is left untouched; a numeric, labelled copy
* is created under the same name with the suffix _en.

* Yes/No questions
foreach item in real_estate mortgage loans short_save ///
		cod long_save credit_card credit_card_debt {
	destring `item', generate(`item'_en)
	label values `item'_en yes_no
}

* Real estate value
label define real_estate_value ///
	1 "$0-$49,999" ///
	2 "$50,000-$99,999" ///
	3 "$100,000-$149,999" ///
	4 "$150,000-$199,999" ///
	5 "$200,000-299,999" ///
	6 "$300,000-$499,999" ///
	7 "$500,000-$749,999" ///
	8 "$750,000-999,999" ///
	9 "$1,000,000-$1,499,999" ///
	10 "$1,500,000-$1,999,999" ///
	11 "$2,000,000-2,999,999" ///
	12 "$3,000,000+"
destring real_estate_value, generate(real_estate_value_en)
label values real_estate_value_en real_estate_value

* Rate type of mortgages and loans
label define rate ///
	1 "All fixed-rate" ///
	2 "All capped-variable-rate" ///
	3 "All variable-rate" ///
	4 "A mix of the previous three"
foreach item in mortgage_rate loans_rate {
	destring `item', generate(`item'_en)
	label values `item'_en rate
}

* Value of mortgages and loans
label define mort_loan_value ///
	1 "$0-$49,999" ///
	2 "$50,000-$99,999" ///
	3 "$100,000-$149,999" ///
	4 "$150,000-$199,999" ///
	5 "$200,000-299,999" ///
	6 "$300,000+"
foreach item in mortgage_value loans_value {
	destring `item', generate(`item'_en)
	label values `item'_en mort_loan_value
}

* Value of short-term savings and credit cards
label define short_save_credit_card_value ///
	1 "$0-$999" ///
	2 "$1,000-$2,999" ///
	3 "$3,000-$4,999" ///
	4 "$5,000-$9,999" ///
	5 "$10,000-$19,999" ///
	6 "$20,000-$29,999" ///
	7 "$30,000-$49,999" ///
	8 "$50,000-99,999" ///
	9 "$100,000-$149,999" ///
	10 "$150,000-$199,999" ///
	11 "$200,000-299,999" ///
	12 "$300,000+"
foreach item in short_save_value credit_card_value {
	destring `item', generate(`item'_en)
	label values `item'_en short_save_credit_card_value
}

* Value of cod and long-term savings
label define cod_long_save_value ///
	1 "$0-$9,999" ///
	2 "$10,000-$19,999" ///
	3 "$20,000-$29,999" ///
	4 "$30,000-$49,999" ///
	5 "$50,000-99,999" ///
	6 "$100,000-$149,999" ///
	7 "$150,000-$199,999" ///
	8 "$200,000-299,999" ///
	9 "$300,000+"
foreach item in cod_value long_save_value {
	destring `item', generate(`item'_en)
	label values `item'_en cod_long_save_value
}

* Save the encoded dataset so it can be loaded in Python for the emotion classification code
save "$survey_data/BPEA_survey_B_1_encoded.dta", replace








