# you can download the CDC natality file at:-
# https://www.cdc.gov/nchs/data_access/VitalStatsOnline.htm#Births
# read in the file
library(utils) # this is a core R package, so no need to install it
# read.fwf is for reading data in fixed-width format
hdat=read.fwf('/home/jonathan/mplus/natality/Nat2019us/Nat2019PublicUS.c20200506.r20200915.txt', widths=570)
dim(hdat) # [1] 3757582

# functions to extract data from specific columns
anc=function(x,y) return(as.numeric(sapply(hdat,substr,x,y)))
yn=function(x){ z=sapply(hdat,substr,x,x); a=rep(0,length(z)); a[z=='Y']=1; a[z=='U']=NA; return(a)}

# extract the data named in the file
btime=anc(19,22); bday=anc(23,23); bplace=anc(32,32); mage=anc(75,76); mrace=anc(105,106)
meduc=anc(124,124); fage=anc(147,148); frace=anc(151,152); feduc=anc(163,163)
mhisp=anc(117,117); fhisp=anc(162,162); plural=anc(454,454)
precarem=anc(224,225); precaren=anc(238,239); patack=yn(119); marstat=anc(120,120); wic=yn(251)
plive=anc(171,172); pdead=anc(173,174); pterm=anc(175,176)
livord=anc(179,179); totord=anc(182,182); intbth=anc(198,200); intliv=anc(201,202); intoth=anc(206,208)
cig0=anc(253,254); cig0=anc(253,254); cig1=anc(255,256); cig2=anc(257,258); cig3=anc(259,260)
mht=anc(280,281); mbmi=anc(283,286); prewt=anc(292,294); delwt=anc(299,301); addwt=anc(304,305)
predm=yn(313); prgdm=yn(314); prebp=yn(315); prgbp=yn(316); labanalg=yn(388); eclmps=yn(317)
prevptb=yn(318); prevlscs=yn(331); numlscs=anc(332,333)
gonor=yn(343); syph=yn(344); chlm=yn(345); hepb=yn(346); hepc=yn(347); noinf=anc(353,353)
okver=yn(360); badver=yn(361); labind=yn(383); labaug=yn(384); labster=yn(385); labant=yn(386); choram=yn(387)
bpres=anc(401,401); brout=anc(402,402); trial=yn(403)
trans=yn(415); lacer=yn(416); ruptu=yn(417); uhyst=yn(418); adicu=yn(419)
attnd=anc(433,433); mtran=yn(434); payer=anc(435,435)
apgar5=anc(444,445); apgar10=anc(448,449); plural=anc(454,454)
mf=function(x) {y=sapply(hdat,substr,x,x); z=rep(0,length(y)); z[y=='M']=1; return(z)}
sexinf=mf(475); lmpm=anc(477,478); cgest=anc(490,491); obgest=anc(499,500); bwt=anc(504,507)
ventil=yn(517); ventil6=yn(518); adnicu=yn(519); bsurf=yn(520); banti=yn(521); bseiz=yn(522)
anenc=yn(537); mening=yn(538); chd=yn(539); dihern=yn(540); omph=yn(541); gastro=yn(542)
limb=yn(549); cleft=yn(550); clpal=yn(551); hypo=yn(554)
gd=function(x){y=sapply(hdat,substr,x,x); z=rep(0,length(y)); z[y=='C']=1; z[y=='P']=.5; z[y=='U']=-1; return(z)}
down=gd(552); chrom=gd(553)
itran=yn(567); ilive=yn(568); breast=yn(569)

# select subset of the data
hdat=data.frame(btime,bday,bplace,mage,mrace,meduc,marstat,wic,fage,frace,feduc,patack,plive,pdead,pterm,livord,totord,
intbth,intliv,intoth,plural,cig0,cig1,cig2,cig3,mht,mbmi,precarem,precaren,prewt,delwt,addwt,predm,prgdm,prebp,prgbp,eclmps,
prevptb,prevlscs,numlscs,gonor,syph,chlm,hepb,hepc,noinf,labind,labaug,labanalg,labster,labant,choram,bpres,brout,trial,
okver,badver,trans,lacer,ruptu,uhyst,adicu,attnd,mtran,payer,apgar5,apgar10,plural,sexinf,cgest,obgest,lmpm,bwt,ventil,ventil6,
adnicu,bsurf,banti,bseiz,anenc,mening,chd,dihern,omph,gastro,limb,cleft,clpal,hypo,down,chrom,itran,ilive,breast,fhisp,mhisp)

# feduc:-
# 1 = 8th grade or less
# 2 = 9th through 12 th grade with no diploma
# 3 = High school graduate or GED completed
# 4 = Some college credit, but not a degree.
# 5 = Associate degree (AA,AS)
# 6 = Bachelor’s degree (BA, AB, BS)
# 7 = Master’s degree (MA, MS, MEng, MEd, MSW, MBA)
# 8 = Doctorate (PhD, EdD) or Professional Degree (MD, DDS, DVM, LLB, JD)
#9 =Unknown
# frace:-
# 1 = White (only) [only one race reported]
# 2 = Black (only)
# 3 = AIAN (American Indian or Alaskan Native) (only)
# 4 = Asian (only)
# 5 = NHOPI (Native Hawaiian or Other Pacific Islander) (only)
# 6 = Black and White
# 7 = Black and AIAN
# 8 = Black and Asian
# 9 = Black and NHOPI
# 10 = AIAN and White
# 11 = AIAN and Asian
# 12 = AIAN and NHOPI
# 13 = Asian and White
# 14 = Asian and NHOPI
# 15 = NHOPI and White
# 16 = Black, AIAN, and White
# 17 = Black, AIAN, and Asian
# 18 = Black, AIAN, and NHOPI
# 19 = Black, Asian, and White
# 20 = Black, Asian, and NHOPI
# 21 = Black, NHOPI, and White
# 22 = AIAN, Asian, and White
# 23 = AIAN, NHOPI, and White
# 24 = AIAN, Asian, and NHOPI
# 25 = Asian, NHOPI, and White
# 26 = Black, AIAN, Asian, and White
# 27 = Black, AIAN, Asian, and NHOPI
# 28 = Black, AIAN, NHOPI, and White
# 29 = Black, Asian, NHOPI, and White
# 30 = AIAN, Asian, NHOPI, and White
# 31 = Black, AIAN, Asian, NHOPI, and White
# 99 = Unknown or Not Stated
# fhisp:-
# 1 = Non-Hispanic White (only)
# 2 = Non-Hispanic Black (only)
# 3 = Non-Hispanic AIAN (only)
# 4 = Non-Hispanic Asian (only)
# 5 = Non-Hispanic NHOPI (only)
# 6 = Non-Hispanic more than one race
# 7 = Hispanic
# 8 = Origin unknown or not stated
# 9 = Race unknown or not stated (Non-Hispanic)
# payer:- 
# 1=Medicaid; 2=private insurance; 3=self; 4=Indian Health Service;
# 5=CHAMPUS/TRICARE (military); 6=other Government; 8 = Other (known); 9 = unknown

# save subset for later quick review
write.csv(hdat,'/home/jonathan/Desktop/natality/usnatal19_071121.csv')

if (!exists('hdat')) hdat=read.csv('/home/jonathan/Desktop/natality/usnatal19_071121.csv')


# define missing values and transform data
dim(hdat); names(hdat)
hdat$btime[hdat$btime==9999]=NA; hdat$bplace[hdat$bplace==9]=NA
hdat$mrace[hdat$mrace==99]=NA; hdat$mage[hdat$mage==99]=NA; hdat$meduc[hdat$meduc==9]=NA
hdat$frace[hdat$frace==99]=NA; hdat$fage[hdat$fage==99]=NA; hdat$feduc[hdat$feduc==9]=NA

hdat$plive[hdat$plive==99]=NA; hdat$pdead[hdat$pdead==99]=NA; hdat$pterm[hdat$pterm==99]=NA
hdat$livord[hdat$livord==9]=NA; hdat$totord[hdat$totord==9]=NA; hdat$intbth[hdat$intbth>=888]=NA
hdat$intliv[hdat$intliv>=888]=NA; hdat$intoth[hdat$intoth>=888]=NA
hdat$mhisp[hdat$mhisp>7]=NA; hdat$fhisp[hdat$fhisp>7]=NA
hdat$mhisp=as.numeric(hdat$mhisp==7); hdat$fhisp=as.numeric(hdat$fhisp==7)

# convert Imperial measures to SI
hdat$mht[hdat$mht==99]=NA; hdat$mht=hdat$mht*.0254; quantile(hdat$mht,na.rm=T)
hdat$prewt[hdat$prewt==999]=NA; hdat$prewt=hdat$prewt*0.453592; quantile(hdat$prewt,na.rm=T)
hdat$delwt[hdat$delwt==999]=NA; hdat$delwt=hdat$delwt*0.453592; quantile(hdat$delwt,na.rm=T)
hdat$addwt[hdat$addwt==999]=NA; hdat$addwt=hdat$addwt*0.453592; quantile(hdat$addwt,na.rm=T)

# define mean cigarettes per day, both before and during pregnancy
hdat$cig0[hdat$cig0==99]=NA; hdat$cig1[hdat$cig1==99]=NA; 
hdat$cig2[hdat$cig2==99]=NA; hdat$cig3[hdat$cig3==99]=NA; 
hdat$logcig0=log(hdat$cig0+1); hdat$logcig1=log(hdat$cig1+1)
hdat$logcig2=log(hdat$cig2+1); hdat$logcig3=log(hdat$cig3+1)
hdat$totcig=log((hdat$cig0+hdat$cig1+hdat$cig2+hdat$cig3)/4+1)
table(hdat$cig0>0,hdat$cig1>0)

# define more missing values
hdat$mbmi[hdat$mbmi==99.9]=NA; quantile(hdat$mbmi,na.rm=T)
hdat$numlscs[hdat$numlscs==99]=NA
hdat$bpres[hdat$bpres==9]=NA; hdat$bpres=factor(hdat$bpres) #fetal presentation
hdat$brout[hdat$brout==9]=NA; hdat$brout=factor(hdat$brout) #route of delivery
hdat$cgest[hdat$cgest==99]=NA; hdat$obgest[hdat$obgest==99]=NA
hdat$bwt[hdat$bwt==9999]=NA; table(is.na(hdat$bwt)); hdat$apgar5[hdat$apgar5==99]=NA

# estimate non-Hispanic European American parentage
hdat$eurom=0; hdat$eurom[hdat$mrace==1]=1; hdat$eurom[hdat$mrace%in%c(6,10,13,15)]=.5
hdat$eurom[hdat$mrace%in%c(16,19,21:23,25)]=.33
hdat$eurom[hdat$mrace%in%c(26,28:30)]=.25
hdat$eurom[hdat$mrace==31]=.125
hdat$eurof=0; hdat$eurof[hdat$frace==1]=1
hdat$eurof[hdat$frace%in%c(6,10,13,15)]=.5
hdat$eurof[hdat$frace%in%c(16,19,21:23,25)]=.33
hdat$eurof[hdat$frace%in%c(26,28:30)]=.25
hdat$eurof[hdat$frace==31]=.125; hdat$eurof[is.na(hdat$frace)]=NA

# estimate African American parentage
hdat$aframm=0; hdat$aframm[hdat$mrace==2]=1
hdat$aframm[hdat$mrace%in%c(6:9)]=.5
hdat$aframm[hdat$mrace%in%c(16:21)]=.33
hdat$aframm[hdat$mrace%in%c(26:29)]=.25
hdat$aframm[hdat$mrace==31]=.125
hdat$aframf=0; hdat$aframf[hdat$frace==2]=1
hdat$aframf[hdat$frace%in%c(6:9)]=.5
hdat$aframf[hdat$frace%in%c(16:21)]=.33
hdat$aframf[hdat$frace%in%c(26:29)]=.25
hdat$aframf[hdat$frace==31]=.125; 
hdat$aframf[is.na(hdat$frace)]=NA

# estimate NHPI parentage
hdat$nhpim=0; hdat$nhpim[hdat$mrace==5]=1
hdat$nhpim[hdat$mrace%in%c(9,12,14:15)]=.5
hdat$nhpim[hdat$mrace%in%c(18,20,21,23:25)]=.33
hdat$nhpim[hdat$mrace%in%c(27:30)]=.25
hdat$nhpim[hdat$mrace==31]=.125
hdat$nhpif=0; hdat$nhpif[hdat$frace==5]=1
hdat$nhpif[hdat$frace%in%c(9,12,14:15)]=.5
hdat$nhpif[hdat$frace%in%c(18,20,21,23:25)]=.33
hdat$nhpif[hdat$frace%in%c(27:30)]=.25
hdat$nhpif[hdat$frace==31]=.125
# very odd that table(hdat$nhpif,hdat$aframf) -> some nhpif=1 & aframf=.5!
# I have checked repeatedly and cannot find a coding error
# therefore, rectify directly by recoding to 0.5
hdat$nhpif[hdat$frace==9 & hdat$nhpif==1 & hdat$aframf==.5]=.5
hdat$nhpif[hdat$frace==8 & hdat$nhpif==1 & hdat$aframf==.5]=0
hdat$nphif[is.na(hdat$frace)]=NA

# estimate Asian American parentage
hdat$asianm=0; hdat$asianm[hdat$mrace==4]=1
hdat$asianm[hdat$mrace%in%c(8,11,13,14)]=.5
hdat$asianm[hdat$mrace%in%c(17,19,20,22,24)]=.33
hdat$asianm[hdat$mrace%in%c(26,27,29,30)]=.25
hdat$asianm[hdat$mrace==31]=.125
hdat$asianf=0; hdat$asianf[hdat$frace==4]=1
hdat$asianf[hdat$frace%in%c(8,11,13,14)]=.5
hdat$asianf[hdat$frace%in%c(17,19,20,22,24)]=.33
hdat$asianf[hdat$frace%in%c(26,27,29,30)]=.25
hdat$asianf[hdat$frace==31]=.125
hdat$asianf[is.na(hdat$frace)]=NA

# estimate AIAN American parentage
hdat$aianm=0; hdat$aianm[hdat$mrace==3]=1
hdat$aianm[hdat$mrace%in%c(7,10:12)]=.5
hdat$aianm[hdat$mrace%in%c(16:18,22:24)]=.33
hdat$aianm[hdat$mrace%in%c(26:28,30)]=.25
hdat$aianm[hdat$mrace==31]=.125
hdat$aianf=0; hdat$aianf[hdat$frace==3]=1
hdat$aianf[hdat$frace%in%c(7:10,12)]=.5
hdat$aianf[hdat$frace%in%c(16:18,22:24)]=.33
hdat$aianf[hdat$frace%in%c(26:28,30)]=.25
hdat$aianf[hdat$frace==31]=.125
hdat$aianf[is.na(hdat$frace)]=NA

# functions to describe the data
tdesc=function(x) {print(table(is.na(x))); print(table(x))}
qdesc=function(x) {print(table(is.na(x))); print(quantile(x, na.rm=T))}

# more data transformations and definitions of missing values
hdat$bmi=round(hdat$prewt/(hdat$mht^2),1); qdesc(hdat$bmi)
hdat$precaren[hdat$precaren==99]=NA; tdesc(hdat$precaren)
hdat$precarem[hdat$precarem==99]=NA; tdesc(hdat$precarem)

# mothers with precarem of zero had no pre-natal care (PNC)
# in one sense, they left seeking care too late -
# so they more resemble mothers who began PNC in the 9th or 10th month of gestation
# there is also the question whether some of these mothers recognised the pregnancy
# obstetric and maternal estimates of duration of gestation correlate quite poorly
# therefore, recode PNC=0, to PNC=11
hdat$precarem[hdat$precarem==0]=11

# recode source of payment; 2 & 3 = private, 1 & 4:8 = government
hdat$jpay=hdat$payer
hdat$payer[hdat$payer%in%c(1,4:7)]=0; hdat$payer[hdat$payer%in%c(2,3)]=1
hdat$payer[hdat$payer%in%c(8,9)]=NA; table(hdat$payer)

# select only first primigravidae
dim(hdat); table(is.na(hdat$X))
hdat=hdat[!is.na(hdat$X),]; dim (hdat) # 3748546 113
hdat=hdat[!is.na(hdat$plural),]; dim(hdat); tdesc(hdat$plural)
hdat=hdat[!is.na(hdat$livord),]; dim(hdat); tdesc(hdat$livord)
hdat=hdat[!is.na(hdat$prevlscs),]; dim(hdat)
hdat=hdat[hdat$plural==1 & hdat$livord==1 & hdat$pterm==0 & hdat$prevlscs==0,]; hdat=hdat[!is.na(hdat$X),]; dim(hdat)
# [1] 1142705 115

# exclude mothers with pre-existing diabetes or hypertension
hdat=hdat[hdat$predm==0 & hdat$prebp==0,]; dim(hdat)
# [1] 1115025     115

# exclude mothers with extreme values of age, height or BMI
hdat=hdat[hdat$mage>14 & hdat$mage<=45,]; dim(hdat) 
hdat=hdat[hdat$mht>=1.4 & hdat$mht<1.88 & hdat$bmi>16.5 & hdat$bmi<40,]; dim(hdat) 
# [1] 1057898     115

# exclude babies with extreme values of weight or anencephaly
hdat=hdat[hdat$bwt>1000 & hdat$bwt<=5000,]; dim(hdat); qdesc(hdat$bwt) # [1] 1052034     115
hdat=hdat[hdat$anenc==0,]; dim(hdat) # 1051952     115

# exclude premature and postmature babies
hdat=hdat[hdat$obgest<=42 & hdat$obgest>=35,]; dim(hdat)
# [1] 1026746     115

# compute 'other' parental ethnicity = Asian + AIAN + NHPI
hdat$othm = hdat$asianm+hdat$aianm+hdat$nhpim; table(hdat$othm)
hdat$othf = hdat$asianf+hdat$aianf+hdat$nhpif; table(hdat$othf)
table(is.na(hdat$X)); dim(hdat)
#   FALSE    TRUE 
# 1002191   24555 
# [1] 1026746     117

# exclude births with any missing paternal demographic data
x1=!is.na(hdat$frace) & !is.na(hdat$fhisp)
x2=!is.na(hdat$feduc) & !is.na(hdat$fage)
x3=!is.na(hdat$mrace) & !is.na(hdat$mhisp)

# exclude births with any missing maternal demographic data
x4=!is.na(hdat$meduc) & !is.na(hdat$mage)
x5=!is.na(hdat$totcig)

# exclude births with any other relevant variables missing
x7=!is.na(hdat$mht) & !is.na(hdat$mbmi)
x8=!is.na(hdat$payer) & !hdat$payer%in%c(8,9) & !is.na(hdat$wic)
x9=!is.na(hdat$prgbp) & !is.na(hdat$prgdm)
x10=!is.na(hdat$obgest) & !is.na(hdat$bwt) & !is.na(hdat$apgar5)
x12=!is.na(hdat$precarem) & hdat$precarem!=0

xrace = x1 & x3; xdad = x1 & x2; xmum = x3 & x4 & x5
xprep = x7 & x8 & x12; xpreg = x9 & x10
xtot = xrace & xdad & xmum & xprep & xpreg; table(xtot)
#  FALSE   TRUE 
# 250648 776098

table(hdat$precarem)
xok=hdat$fage>14 & hdat$fage<60 & hdat$precarem<9; table(xok[xtot])
# FALSE   TRUE 
# 12475 763623 
dim(hdat[xtot & xok,]) # [1] 763623    117

table(xtot,hdat$aframm)

hvs=c('X','fage','aframf','fhisp','othf','feduc','payer','aframm','mhisp','othm','mage','mht',
'meduc','bmi','totcig','wic','precarem','prgbp','prgdm','obgest','bwt','apgar5','sexinf')
#for (i in hvs) {print(i); print(table(is.na(hdat[,i])))}
dim(na.omit(hdat[xtot & xok,hvs])); # 763623     23

table(is.na(hdat[xtot & xok,hvs])) # F 17563329
write.table(hdat[xtot & xok,hvs],file='/home/jonathan/Desktop/natality/all first singletons 210724.dat',na='.',
quote=F, row.names=F, col.names=F)

cvs=c(hvs,'gonor','syph','chlm','hepb','hepc')
dim(na.omit(hdat[xtot & xok, cvs])); table(is.na(hdat[xtot & xok,cvs]))
dim(na.omit(hdat[xtot & xok & hdat$fage>14 & hdat$fage<60 & hdat$precarem<9,cvs])) # 762786  28
# 762786 is the number of cases in the Mplus output

write.table(na.omit(hdat[xtot & xok,cvs]),file='/home/jonathan/Desktop/natality/chlm first singletons 210724.dat',na='.',
quote=F, row.names=F, col.names=F)

##############################################################################################
##############################################################################################
zdat=na.omit(hdat[xtot & xok,cvs])
ztot=hdat$X%in%zdat$X; table(ztot)
#  FALSE   TRUE 
# 263960 762786 

# test differences between complete cases (selected) and cases with missing values
jx=function(x) {print(x); print('+++++++++++++++++++++++++++++++++')
print(tapply(hdat[,x],ztot,quantile,na.rm=T, c(.5,.25,.75)))
if (length(unique(hdat[,x]))>9) print(ks.test(hdat[ztot,x],hdat[!xtot,x]))
if (length(unique(hdat[,x]))<=9) print(wilcox.test(hdat[ztot,x],hdat[!ztot,x]))
print('===================================')}

jt=function(x) {print(x); print(tapply(hdat[,x],xtot,mean,na.rm=T)); print(fisher.test(table(xtot,hdat[,x])))}

jx('mage'); jx('fage'); jx('mht'); jx('bmi'); jx('precarem')
jx('meduc'); jx('feduc'); jx('obgest'); jx('bwt'); jx('cig0'); jx('totcig')
hdat$smoker=as.numeric(hdat$totcig>0); table(hdat$smoker,ztot)
#   ztot
#     FALSE   TRUE
#  0 216922 725523
#  1  18869  37263
100*18869/(18869+216922)
# [1] 0.08002426
100*37263/(37263+725523)
# [1] 4.885118
tapply(exp(hdat$totcig[hdat$totcig>0])-1,ztot[hdat$totcig>0],quantile)
# $`FALSE`
#   0%   25%   50%   75%  100% 
# 0.25  2.50  5.00 10.00 98.00 
#$`TRUE`
#   0%   25%   50%   75%  100% 
# 0.25  2.00  5.00 10.00 98.00 

jt('sexinf'); jt('prgbp'); jt('prgdm'); jt('wic'); jt('payer')

hdat$meth=0; hdat$feth=0 # define parental ethnicities as >=0.5
hdat$meth[hdat$aianm>=.5 | hdat$asianm>=.5 | hdat$nhpim>=.5]=3
hdat$meth[hdat$mhisp>=.5]=2; hdat$meth[hdat$aframm>=.5]=1
hdat$feth[hdat$aianf>=.5 | hdat$asianf>=.5 | hdat$nhpif>=.5]=3
hdat$feth[hdat$fhisp>=.5]=2; hdat$feth[hdat$aframf>=.5]=1
tf1=table(ztot,hdat$feth); chisq.test(tf1)
tf1; tfx=tf1[1,]/apply(tf1,2,sum); tfx[2:4]/tfx[1]
for (i in 2:4) print(fisher.test(tf1[,c(1,i)])$estimate)
tm1=table(ztot,hdat$meth); chisq.test(table(ztot,hdat$meth))
tm1; tmx=tm1[1,]/apply(tm1,2,sum); tmx[2:4]/tmx[1]
for (i in 2:4) print(fisher.test(tm1[,c(1,i)])$estimate)

library(psych); cohen.kappa(table(hdat$meth),hdat$feth) # 0.62
cohen.kappa(table(hdat$meth[ztot]),hdat$feth[ztot]) # 0.72
cohen.kappa(table(hdat$meth[!ztot]),hdat$feth[!ztot]) # 0.29

t1=table(ztot,hdat$totcig>0); t1; t1[1,2]/sum(t1[1,]); t1[2,2]/sum(t1[2,]); fisher.test(t1)
# ethnicity x smoking
for (i in 1:3) {eok=hdat$feth==i; t11=table(hdat$totcig[eok],ztot[eok]>0); print(fisher.test(t11)$estimate)}
for (i in 1:3) {eok=hdat$feth==i; t11=table(hdat$totcig[eok],ztot[eok]>0); print(fisher.test(t11)$estimate)}
sok=hdat$totcig>0; tapply(round(exp(hdat$totcig[sok]),1),ztot[sok],quantile,c(.5,.25,.75),na.rm=T)
ks.test(totcig[sok & ztot], totcig[sok &!ztot])

t1=table(ztot,hdat$wic>0); t1; t1[1,2]/sum(t1[1,]); t1[2,2]/sum(t1[2,]); fisher.test(t1)
for (i in 1:3) {eok=hdat$feth==i; t11=table(ztot[eok],hdat$wic[eok]>0); print(fisher.test(t11)$estimate)}
for (i in 1:3) {eok=hdat$feth==i; t11=table(ztot[eok],hdat$wic[eok]>0); print(fisher.test(t11)$estimate)}
t1=table(ztot,hdat$payer>0); t1; t1[1,2]/sum(t1[1,]); t1[2,2]/sum(t1[2,]); fisher.test(t1)
t1=table(ztot,hdat$prgbp>0); t1; t1[1,2]/sum(t1[1,]); t1[2,2]/sum(t1[2,]); fisher.test(t1)
t1=table(ztot,hdat$prgdm>0); t1; t1[1,2]/sum(t1[1,]); t1[2,2]/sum(t1[2,]); fisher.test(t1)
t1=table(ztot,hdat$bwt<2500); t1; t1[1,2]/sum(t1[1,]); t1[2,2]/sum(t1[2,]); fisher.test(t1)
t1=table(ztot,hdat$obgest<36); t1; t1[1,2]/sum(t1[1,]); t1[2,2]/sum(t1[2,]); fisher.test(t1)

# now recall jdat which is defined above as cases for Mplus output
dim(jdat)[1] # 785879 = same as Mplus output
aok2=hdat$X%in%jdat$X; table(aok2) # same as Mplus output
#  aok2
#   FALSE   TRUE 
#  613335 785879 

jdat=hdat[ztot,]; table(jdat$feth)
# TO CONSTRUCT TABLE 1 IN MAIN REPORT
t1=table(jdat$totcig>0,jdat$meth)
t1; round(100*t1[2,]/(t1[1,]+t1[2,]),2)
tapply(exp(jdat$totcig[jdat$totcig>0]),jdat$meth[jdat$totcig>0], quantile,c(.25,.5,.75))
tapply(jdat$meduc,jdat$meth,quantile,c(.25,.5,.75))
tapply(jdat$feduc,jdat$feth,quantile,c(.25,.5,.75))
tapply(jdat$mage,jdat$meth,quantile,c(.25,.5,.75))
tapply(jdat$fage,jdat$feth,quantile,c(.25,.5,.75))
tapply(jdat$mht,jdat$meth,quantile,c(.25,.5,.75))
tapply(jdat$bmi,jdat$meth,quantile,c(.25,.5,.75))
t1=table(jdat$wic>0,jdat$meth)
t1; round(100*t1[2,]/(t1[1,]+t1[2,]),2)
t1=table(jdat$payer,jdat$meth)
t1; round(100*t1[2,]/(t1[1,]+t1[2,]),2)
tapply(jdat$precarem,jdat$meth,quantile,c(.25,.5,.75))
t1=table(jdat$prgbp==1,jdat$meth)
t1; round(100*t1[2,]/(t1[1,]+t1[2,]),2)
t1=table(jdat$prgdm==1,jdat$meth)
t1; round(100*t1[2,]/(t1[1,]+t1[2,]),2)
tapply(jdat$obgest,jdat$meth,quantile,c(.25,.5,.75))
tapply(jdat$bwt,jdat$meth,quantile,c(.25,.5,.75))

jdat$jfac6=0 # compute factor scores directly using the loadings and each case's data
jdat$jfac6=-.668*scale((jdat$fage-30)/5)-.81*scale(jdat$feduc)
jdat$jfac6=jdat$jfac6-1.025*scale(jdat$meduc)
jdat$jfac6=jdat$jfac6-.117*scale((jdat$mht-1.65)*10)
jdat$jfac6=jdat$jfac6-.076*scale((jdat$bmi-25)/5)
jdat$jfac6=jdat$jfac6-1.517*scale(jdat$wic)
jdat$jfac6=scale(jdat$jfac6)
xl='estimated deprivation score'
plot(density(jdat$jfac6[jdat$meth==0]), xlim=c(-4,4), ylim=c(0,.6), xlab=xl, lwd=1.5, main='')
abline(v=c(-1,1),lty=2)
lines(density(jdat$jfac6[jdat$meth==1]), lty=2, lwd=1.5)
lines(density(jdat$jfac6[jdat$meth==2]), lty=3, lwd=2)
lines(density(jdat$jfac6[jdat$meth==3]), lty=4, lwd=1.5)
leg=c('Maternal ethnicity','non-Hispanic European','African American','Hispanic','Other')
legend('topright', lty=c(0:4), legend=leg, cex=.75)
##############################################################################################
##############################################################################################

tb1=table(jdat$bwt<2500,jdat$meth,jdat$sexinf)
tb1; stb1=tb1[2,,1]/(tb1[1,,1]+tb1[2,,1]); tb1[2,,2]/(tb1[1,,2]+tb1[2,,2])
stb1f=(tb1[2,,1]/(tb1[1,,1]+tb1[2,,1]))/(tb1[2,1,1]/(tb1[1,1,1]+tb1[2,1,1]))
stb1m=(tb1[2,,2]/(tb1[1,,2]+tb1[2,,2]))/(tb1[2,1,2]/(tb1[1,1,2]+tb1[2,1,2]))

# functions to compute probabilities from probit path coefficients in the Mplus outputs

# 1) compute probabilities for probits of binary outcomes
px=function(x,y=NULL){ w=1
if (x%in%c('age','bmi')) w=.2; if (x=='mht') w=.1; if (x=='socadv') w=-1;
x=toupper(x); z=paste(toupper(y),'$1',sep=''); y=paste(toupper(y),'ON',sep='.')
a=m1$est[m1[,1]==y & m1[,2]==x]; b=m1$est[m1[,2]==z]
if (x%in%c('FEDUC','MEDUC')) {
t3=m1$est[m1[,1]=='Thresholds' & m1[,2]==paste(x,'3',sep='$')]
t4=m1$est[m1[,1]=='Thresholds' & m1[,2]==paste(x,'4',sep='$')]
w=(pnorm(t4)-pnorm(t3))}
return(pnorm(a*w-b)-pnorm(-b))}

# level 3 = High school graduate or GED completed
# Threshold 3
# level 4 = Some college credit, but not a degree
# Threshold 4
# level 5 = Associate degree (AA,AS)

# excerpt from Mplus manual
# Consider the following model:
# x -> u -> y
# where u is a categorical variable. The issue is how is u treated when it is
# a dependent variable predicted by x and how is it treated when it is an
# independent variable predicting y. With weighted least squares
# estimation, a probit regression coefficient is estimated in the regression
# of u on x. In the regression of y on u, the continuous latent response
# variable u* is used as the covariate.

# prgbp on meduc b=0.036; meduc$3=-1.408; meduc$4=-.498
# P (u = 0 | x) = F (t 1 - b*x),
# P (u = 1 | x) = F (t 2 - b*x) - F (t 1 - b*x),
# P (u = 2 | x) = F (- t 2 + b*x)

# so, now need to predict meduc using cig0, aframm, etc.
# and need to predict feduc using fage, aframf, etc.

# Compute probability for increasing educational level from school->college
edy=function(x,y=NULL){ w=1; yt=toupper(y)
if (x%in%c('age','bmi')) w=.2; if (x=='mht') w=.1; if (x=='socadv') w=-1;
x=toupper(x); y=paste(toupper(y),'ON',sep='.')
a=m1$est[m1[,1]==y & m1[,2]==x]
if (y%in%c('FEDUC.ON','MEDUC.ON')) {
t3=m1$est[m1[,1]=='Thresholds' & m1[,2]==paste(yt,'3',sep='$')]
t4=m1$est[m1[,1]=='Thresholds' & m1[,2]==paste(yt,'4',sep='$')]}
return(pnorm(t4+a*w)-pnorm(t3+a*w)-(pnorm(t4)-pnorm(t3)))}

# compute probability for education (increasing school->college) as loading
edx=function(x,y=NULL){ w=1
x=toupper(x); y=paste(toupper(y),'BY',sep='.')
a=m1$est[m1[,1]==y & m1[,2]==x]
if (x%in%c('FEDUC','MEDUC')) {
t3=m1$est[m1[,1]=='Thresholds' & m1[,2]==paste(x,'3',sep='$')]
t4=m1$est[m1[,1]=='Thresholds' & m1[,2]==paste(x,'4',sep='$')]}
return((pnorm(t4+a)-pnorm(t3+a))*(pnorm(t4)-pnorm(t3)))}

# compute probability for education (increasing school->college) as 'x' variable
pedx=function(x,y=NULL){ w=1
x=toupper(x); z=paste(toupper(y),'$1',sep=''); y=paste(toupper(y),'ON',sep='.')
a=m1$est[m1[,1]==y & m1[,2]==x]; b=m1$est[m1[,2]==z]
if (x%in%c('FEDUC','MEDUC')) {
t3=m1$est[m1[,1]=='Thresholds' & m1[,2]==paste(x,'3',sep='$')]
t4=m1$est[m1[,1]=='Thresholds' & m1[,2]==paste(x,'4',sep='$')]
w=pnorm(t4)-pnorm(t3)}
return(pnorm(a*w-b)-pnorm(-b))}

# read Mplus output file
library(MplusAutomation)
m1=readModels('/home/jonathan/bayes_deponly_fac6_final_020222f2_ind.out',what='parameters')$parameters
# alternatively, read in the "Surface" model's table of results
#m1=readModels('/home/jonathan/WLSMV_deponly_surface_final_291221f2.out',what='parameters')$parameters
str(m1); m1=m1[[1]][[1]] # use only table of unstandardized path coefficients

tapply(jdat$bwt[aok & jdat$jfac6<0],jdat$meth[aok & jdat$jfac6<0],quantile)
jvs=c('bwt','meth','feth','jfac6','obgest','prgbp','prgdm','totcig','bmi','mht')
gdat=jdat[aok,jvs]
library(gamlss)
g1=gamlss(bwt~(as.factor(meth)+as.factor(feth))*jfac6, data=gdat, family=BCPE)
g2=update(g1, ~.+I(obgest-39)*(as.factor(meth)+as.factor(feth)+totcig+jfac6))
g3=update(g2, ~.+jfac6*(prgbp+prgdm)*obgest)
summary(g1); summary(g2)

ex=function(x) {print(x); print(tapply(hdat[aok,x],hdat$jeth[aok],mean))}
em=function(x) {print(x); print(tapply(hdat[aok,x],hdat$jeth[aok],median))}
et=function(x) {print(x); print(table(hdat$jeth,hdat[aok,x]))}

tapply(hdat$mage,aok,mean); t.test(hdat$mage[aok],hdat$mage[!aok])
tapply(hdat$fage,aok,mean); t.test(hdat$fage[aok],hdat$fage[!aok])

# check nature of Bayesian estimates of factor scores
fdat2=read.table('/home/jonathan/mplus/natality/bayes_fac6_fscores_291221f.dat'); dim(fdat2)
fdat1=read.table('/home/jonathan/mplus/natality/bayes_fac6_fscores_021221.dat'); dim(fdat1)
fdat=read.table('/home/jonathan/mplus/natality/bayes_fac6_fscores_071221.dat'); dim(fdat)
names(fdat)=c('feduc','payer','meduc','wic','prgbp','prgdm','sexinf','fage','mage','mht',
'bmi','cig0','precarem','obgest','bwt','aframf','fhisp','aframm','mhisp','othm','othf',
'depx','depm','depsd','dep97','dep25')

fdat$depx=fdat$depx*-1; fdat$depm=fdat$depm*-1
fdat$ethm=0; fdat$ethf=0
fdat$ethm[fdat$aframm>=.5]=1; fdat$ethm[fdat$mhisp>=.5]=2; fdat$ethm[fdat$othm>=.5]=3
fdat$ethf[fdat$aframf>=.5]=1; fdat$ethf[fdat$fhisp>=.5]=2; fdat$ethf[fdat$othf>=.5]=3
write.csv(fdat, file='/home/jonathan/mplus/natality/bayes_fac6_fscores_071221.csv')


boxplot(fdat$depm~fdat$ethm, col='lightgrey', notch=T, varwidth=T)
boxplot(fdat$depm~fdat$ethm*fdat$wic, col='lightgrey', notch=T, varwidth=T)
boxplot(fdat$depm~fdat$ethf, col='lightgrey', notch=T, varwidth=T)
boxplot(fdat$depm~fdat$ethf*fdat$wic, col='lightgrey', notch=T, varwidth=T)

library(gamlss)
frows=1:dim(fdat)[1]; cok=frows%%50==0; fdatt=fdat[cok,]
g1=gamlss(depm~as.factor(ethm)+as.factor(ethf), data=fdatt); summary(g1)
g2=gamlss(depm~as.factor(ethm)+as.factor(ethf), data=fdatt, family=SHASH, n.cyc=1e3)
summary(g2)
g2a=gamlss(depm~as.factor(ethm)+as.factor(ethf), data=fdatt, family=SHASH, n.cyc=1e3, sigma.fo=~as.factor(ethm)+as.factor(ethf), nu.fo=~as.factor(ethm)+as.factor(ethf))
summary(g2a)
g2b=gamlss(depm~as.factor(ethm)+as.factor(ethf), data=fdatt, family=SHASH, n.cyc=1e3, sigma.fo=~as.factor(ethm)+as.factor(ethf), nu.fo=~as.factor(ethm)+as.factor(ethf), tau.fo=~as.factor(ethm)+as.factor(ethf))
summary(g2b)

LR.test(g2a,g2b); plot (g2b)
# the plot.gamlss function plots residuals (ordinate) against sequence of observations (abscissa)
# note the remarkable variation across sequence of observations for both mean and variance
# this implies that the sequence reflects systematic variation - probably geographical (e.g. state)

g2c=gamlss(depm~as.factor(ethm)+as.factor(ethf)+cs(frows[cok]), data=fdatt, family=SHASH, n.cyc=1e3, sigma.fo=~as.factor(ethm)+as.factor(ethf)+cs(frows[cok]), nu.fo=~as.factor(ethm)+as.factor(ethf)+cs(frows[cok]), tau.fo=~as.factor(ethm)+as.factor(ethf))
summary(g2b)
LR.test(g2b,g2c)
#      Null model: deviance= 40837.12 with  28 deg. of freedom 
# Altenative model: deviance= 40527.02 with  52.57679 deg. of freedom 
# LRT = 310.093 with 24.57679 deg. of freedom and p-value= 0 

plot(density(fdat$depm[fdat$ethm==0]),main='',xlab='deprivation score',ylim=c(0,.5))
lines(density(fdat$depm[fdat$ethm==1]),main='',xlab='deprivation score',lty=5)
lines(density(fdat$depm[fdat$ethm==2]),main='',xlab='deprivation score',lty=2)
lines(density(fdat$depm[fdat$ethm==3]),main='',xlab='deprivation score',lty=4)
legend('topleft',lty=c(1,5,2,4),legend=c('NHW','AA','Hisp','Oth'))
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################