# Load the data and keep only the columns used in the analysis.
# NOTE(review): later code calls %>%/mutate (dplyr), multinom (nnet),
# lrtest (lmtest), postResample (caret) and boot (boot) without visible
# library() calls — make sure those packages are loaded before running.
dat <- readxl::read_excel("maty_znaty.xlsx")
maty <- dplyr::select(dat, Id, GebJahr, Aufnjahr, verb, Sex, Nationality, Variety, Wohnort, Match, verb3, Verb_Bin, verb2, lexem)

# We need to transform the columns into factors, otherwise we will get level NULL----------------------------------------------------
# (read_excel returns character columns; modelling functions need factors.)
maty$Match <- factor(maty$Match)
maty$Variety <- factor(maty$Variety)
maty$Nationality <- factor(maty$Nationality)
maty$Sex <- factor(maty$Sex)
# FIX: dropped the original no-op line `maty$GebJahr <- maty$GebJahr`.
maty$ID <- factor(maty$Id)              # speaker id as factor (original Id kept)
maty$verb_form <- factor(maty$verb3)    # outcome variable for the models below
maty$Verb_Bin <- factor(maty$Verb_Bin)
maty$lexem <- factor(maty$lexem)
maty$jahr <- maty$GebJahr - 1900        # birth year relative to 1900
maty$Age <- 2020 - maty$GebJahr         # age at reference year 2020
maty$livingplace <- factor(maty$Wohnort)
maty$verb2 <- factor(maty$verb2)
# Quick sanity checks on the prepared data.
str(maty$ID) # 37 distinct speakers in the sample
##  Factor w/ 37 levels "AB1990","AH1953",..: 9 31 31 31 31 29 1 15 1 6 ...
# Cross-tabulation of the three verb forms against the three varieties:
table(maty$verb_form, maty$Variety)
##      
##       LEM SLO TRA
##   ma   66   2   2
##   mae  35  22  42
##   mat   3   9   0
# Setting contrasts: pick the reference levels so that model coefficients
# are easier to interpret ("mae" as outcome baseline, "TRA" as variety
# baseline). Base relevel() assignment is equivalent to the piped mutate().
maty$verb_form <- relevel(maty$verb_form, ref = "mae")
maty$Variety <- relevel(maty$Variety, ref = "TRA")
## first regression 
# Multinomial logistic regression: predict the verb form (reference "mae")
# from Variety (reference "TRA"), Age and Sex.
multinomtest <- multinom(verb_form ~ Variety  + Age + Sex 
                       , data = maty, maxit=500, trace =T)
## # weights:  18 (10 variable)
## initial  value 198.848824 
## iter  10 value 107.046896
## iter  20 value 106.246702
## final  value 106.239573 
## converged
# Intercept-only baseline model, for comparison against the full model.
multinomtest_null <- multinom(verb_form ~ 1, data = maty, maxit = 500)
## # weights:  6 (2 variable)
## initial  value 198.848824 
## iter  10 value 158.797551
## iter  10 value 158.797551
## iter  10 value 158.797551
## final  value 158.797551 
## converged
summary(multinomtest)
## Call:
## multinom(formula = verb_form ~ Variety + Age + Sex, data = maty, 
##     maxit = 500, trace = T)
## 
## Coefficients:
##     (Intercept) VarietyLEM VarietySLO         Age       Sexm
## ma    -1.783955   4.104848  0.8844131 -0.02354656 -1.1490835
## mat  -10.867943   7.304500  9.0196051  0.01362554  0.4973822
## 
## Std. Errors:
##     (Intercept) VarietyLEM VarietySLO         Age      Sexm
## ma    0.8493965  0.7825685  1.0497712 0.008847964 0.4871903
## mat   0.7744550  0.6431493  0.4288724 0.015994630 0.6935965
## 
## Residual Deviance: 212.4791 
## AIC: 232.4791
summary(multinomtest_null) #Null model: The higher Residual Deviance & Akaike Information Criterion values confirm, that our Model including the factors is "better" than the "null model". Our factors indeed tell us something! 
## Call:
## multinom(formula = verb_form ~ 1, data = maty, maxit = 500)
## 
## Coefficients:
##     (Intercept)
## ma   -0.3466246
## mat  -2.1102130
## 
## Std. Errors:
##     (Intercept)
## ma    0.1561625
## mat   0.3056703
## 
## Residual Deviance: 317.5951 
## AIC: 321.5951
#squared test - are the assumptions met?
# FIX: in R's formula language `^` is the crossing operator; applied to a
# single term, `Age^2` simplifies to `Age`, so the original model
# `Variety^2 + Age^2 + Sex^2` was literally identical to the linear model —
# the recorded lrtest showed Df = 0, Chisq = 0, p = 1. Squaring the factors
# Variety and Sex is meaningless; a genuine quadratic term can only be added
# for the numeric Age, via I(), which protects `^` from formula expansion.
multinomtest_q <- multinom(verb_form ~ Variety + Age + I(Age^2) + Sex,
                           data = maty, maxit = 500, trace = TRUE)
lrtest(multinomtest, multinomtest_q) ## is the linear model better than the quadratic model?
## With the original (degenerate) specification both fits had identical
## log-likelihoods and the comparison was vacuous:
##   #Df  LogLik Df Chisq Pr(>Chisq)
## 1  10 -106.24                    
## 2  10 -106.24  0     0          1
# get p-values 

# Wald z statistics for the first model: coefficient / standard error.
z1 <- summary(multinomtest)$coefficients/summary(multinomtest)$standard.errors

# Two-sided p-values. FIX: use lower.tail = FALSE instead of 1 - pnorm(...);
# the subtraction cancels catastrophically for large |z| — the original
# printout showed exact zeros for the whole "mat" row.
p1 <- 2 * pnorm(abs(z1), lower.tail = FALSE)

print(p1) # some significant values
## (original run, with the cancellation-prone formula — the zeros below are
##  precision loss, not true p-values of 0:)
##     (Intercept)   VarietyLEM VarietySLO         Age       Sexm
## ma   0.03570587 1.559829e-07  0.3995184 0.007785342 0.01834438
## mat  0.00000000 0.000000e+00  0.0000000 0.394279463 0.47330870
## is the model good enough?
# 10-fold cross-validation of out-of-sample classification accuracy.
set.seed(42)

totalAccuracy <- c()
folds <- 10
# FIX: the original chunking `(cv*cvDivider):(cv*cvDivider + cvDivider)`
# (a) never put the first cvDivider rows in any test set, (b) made adjacent
# test sets share one boundary row, and (c) reused `cv` as both the fold
# count and the loop index. Folds are now disjoint, equal-sized chunks of
# rows 1..folds*foldSize (at most folds-1 trailing rows are never tested).
# NOTE(review): rows are taken in file order, not shuffled — if rows are
# grouped by speaker, folds are not independent; consider shuffling first.
foldSize <- floor(nrow(maty) / folds)

for (i in seq_len(folds)) {
  # assign the current disjoint chunk to the test set
  dataTestIndex <- ((i - 1) * foldSize + 1):(i * foldSize)
  dataTest <- maty[dataTestIndex, ]
  # everything else to train
  dataTrain <- maty[-dataTestIndex, ]

  matyModel <- multinom(verb_form ~ Variety + Age + Sex, data = dataTrain,
                        maxit = 1000, trace = TRUE)

  pred <- predict(matyModel, newdata = dataTest, type = "class")

  # classification accuracy on the held-out chunk
  # (caret::postResample expects (pred, obs); accuracy is symmetric, so the
  # original reversed order gave the same number — this matches the docs)
  cv_ac <- postResample(pred, dataTest$verb_form)[[1]]
  print(paste('Current Accuracy:', cv_ac, 'for CV:', i))
  totalAccuracy <- c(totalAccuracy, cv_ac)
}
mean(totalAccuracy)
## The original run (with the overlapping folds) gave per-fold accuracies
## ranging from 0.06 to 1.00 and a mean of 0.6 — the very large spread
## suggests accuracy is highly sensitive to which rows land in the test set.
# bootstrapping the regression model 

# predefine function

# Statistic function for boot(): refit the multinomial model on the rows
# selected by `indices` and return the coefficient matrix.
boot_function <- function(formula, data, indices) {
  d <- data[indices, ] # allows boot to select sample
  # FIX: multinom()/nnet() take `maxit`, not `iter` — the original
  # `iter = 50` was silently swallowed by `...` and ignored, so every
  # bootstrap fit actually ran with the default maxit = 100.
  # We can adjust the iteration cap here.
  fit <- multinom(formula, data = d, maxit = 50, trace = FALSE)
  return(coef(fit))
}
# 500 bootstrap resamples of the data, refitting the model on each resample.
multinomtest_boot<- boot(data = maty, boot_function, formula= verb_form ~  Variety + Age + Sex, R = 500)

# Bootstrap summary: original estimate, bootstrap bias, SE and median for
# each of the 10 coefficients (rows follow the coefficient matrix entries).
# NOTE(review): this bootBias/bootSE/bootMed layout looks like the car
# package's summary method for boot objects — confirm car is loaded.
summary(multinomtest_boot)
## 
## Number of bootstrap replications R = 500 
##      original   bootBias   bootSE    bootMed
## 1   -1.783955 -1.1505461 3.384396  -1.847782
## 2  -10.867943 -3.2715609 4.358584 -14.242189
## 3    4.104848  1.2626986 3.366205   4.257823
## 4    7.304500  1.8538917 3.572635   9.702771
## 5    0.884413 -0.3049700 5.972247   0.898129
## 6    9.019605  2.6819001 2.130334  12.235327
## 7   -0.023547 -0.0013401 0.010180  -0.024143
## 8    0.013626  0.0060114 0.050931   0.015289
## 9   -1.149084 -0.0296250 0.511579  -1.191424
## 10   0.497382  0.2457120 2.101512   0.543677
# Bootstrap-based z statistics and p-values.
# NOTE(review): z2 is built from the bootstrap MEDIAN divided by the
# bootstrap SE — the more common choice is the original estimate over the
# bootstrap SE; confirm the median was intended.
z2 <- summary(multinomtest_boot)$bootMed/summary(multinomtest_boot)$bootSE
# FIX: lower.tail = FALSE avoids the catastrophic cancellation of
# 1 - pnorm(abs(z)) for large |z| (same fix as for p1).
p2 <- 2 * pnorm(abs(z2), lower.tail = FALSE)
# Re-print both model summaries side by side for comparison.
summary(multinomtest)
## Call:
## multinom(formula = verb_form ~ Variety + Age + Sex, data = maty, 
##     maxit = 500, trace = T)
## 
## Coefficients:
##     (Intercept) VarietyLEM VarietySLO         Age       Sexm
## ma    -1.783955   4.104848  0.8844131 -0.02354656 -1.1490835
## mat  -10.867943   7.304500  9.0196051  0.01362554  0.4973822
## 
## Std. Errors:
##     (Intercept) VarietyLEM VarietySLO         Age      Sexm
## ma    0.8493965  0.7825685  1.0497712 0.008847964 0.4871903
## mat   0.7744550  0.6431493  0.4288724 0.015994630 0.6935965
## 
## Residual Deviance: 212.4791 
## AIC: 232.4791
summary(multinomtest_boot)
## 
## Number of bootstrap replications R = 500 
##      original   bootBias   bootSE    bootMed
## 1   -1.783955 -1.1505461 3.384396  -1.847782
## 2  -10.867943 -3.2715609 4.358584 -14.242189
## 3    4.104848  1.2626986 3.366205   4.257823
## 4    7.304500  1.8538917 3.572635   9.702771
## 5    0.884413 -0.3049700 5.972247   0.898129
## 6    9.019605  2.6819001 2.130334  12.235327
## 7   -0.023547 -0.0013401 0.010180  -0.024143
## 8    0.013626  0.0060114 0.050931   0.015289
## 9   -1.149084 -0.0296250 0.511579  -1.191424
## 10   0.497382  0.2457120 2.101512   0.543677
# Compare Wald p-values (p1) with bootstrap-based p-values (p2).
print(p1)
##     (Intercept)   VarietyLEM VarietySLO         Age       Sexm
## ma   0.03570587 1.559829e-07  0.3995184 0.007785342 0.01834438
## mat  0.00000000 0.000000e+00  0.0000000 0.394279463 0.47330870
print(p2)
##  [1] 5.850858e-01 1.084563e-03 2.059168e-01 6.610408e-03 8.804619e-01
##  [6] 9.280241e-09 1.771265e-02 7.640260e-01 1.986362e-02 7.958610e-01
# Citation information for all packages used in this analysis (for the
# bibliography of the accompanying write-up).
citation("boot")
## 
## To cite the 'boot' package in publications use:
## 
##   Angelo Canty and Brian Ripley (2020). boot: Bootstrap R (S-Plus)
##   Functions. R package version 1.3-25.
## 
##   Davison, A. C. & Hinkley, D. V. (1997) Bootstrap Methods and Their
##   Applications. Cambridge University Press, Cambridge. ISBN
##   0-521-57391-2
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
citation("car")
## 
## To cite the car package in publications use:
## 
##   John Fox and Sanford Weisberg (2019). An {R} Companion to Applied
##   Regression, Third Edition. Thousand Oaks CA: Sage. URL:
##   https://socialsciences.mcmaster.ca/jfox/Books/Companion/
## 
## A BibTeX entry for LaTeX users is
## 
##   @Book{,
##     title = {An {R} Companion to Applied Regression},
##     edition = {Third},
##     author = {John Fox and Sanford Weisberg},
##     year = {2019},
##     publisher = {Sage},
##     address = {Thousand Oaks {CA}},
##     url = {https://socialsciences.mcmaster.ca/jfox/Books/Companion/},
##   }
citation("caret")
## 
## To cite package 'caret' in publications use:
## 
##   Max Kuhn (2020). caret: Classification and Regression Training. R
##   package version 6.0-86. https://CRAN.R-project.org/package=caret
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {caret: Classification and Regression Training},
##     author = {Max Kuhn},
##     year = {2020},
##     note = {R package version 6.0-86},
##     url = {https://CRAN.R-project.org/package=caret},
##   }
citation("dplyr")
## 
## To cite package 'dplyr' in publications use:
## 
##   Hadley Wickham, Romain François, Lionel Henry and Kirill Müller
##   (2020). dplyr: A Grammar of Data Manipulation. R package version
##   1.0.1. https://CRAN.R-project.org/package=dplyr
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {dplyr: A Grammar of Data Manipulation},
##     author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller},
##     year = {2020},
##     note = {R package version 1.0.1},
##     url = {https://CRAN.R-project.org/package=dplyr},
##   }
citation("lmtest")
## 
## To cite lmtest in publications use:
## 
##   Achim Zeileis, Torsten Hothorn (2002). Diagnostic Checking in
##   Regression Relationships. R News 2(3), 7-10. URL
##   https://CRAN.R-project.org/doc/Rnews/
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Diagnostic Checking in Regression Relationships},
##     author = {Achim Zeileis and Torsten Hothorn},
##     journal = {R News},
##     year = {2002},
##     volume = {2},
##     number = {3},
##     pages = {7--10},
##     url = {https://CRAN.R-project.org/doc/Rnews/},
##   }
citation("nnet")
## 
## To cite the nnet package in publications use:
## 
##   Venables, W. N. & Ripley, B. D. (2002) Modern Applied Statistics with
##   S. Fourth Edition. Springer, New York. ISBN 0-387-95457-0
## 
## A BibTeX entry for LaTeX users is
## 
##   @Book{,
##     title = {Modern Applied Statistics with S},
##     author = {W. N. Venables and B. D. Ripley},
##     publisher = {Springer},
##     edition = {Fourth},
##     address = {New York},
##     year = {2002},
##     note = {ISBN 0-387-95457-0},
##     url = {http://www.stats.ox.ac.uk/pub/MASS4},
##   }
citation("plyr")
## 
## To cite plyr in publications use:
## 
##   Hadley Wickham (2011). The Split-Apply-Combine Strategy for Data
##   Analysis. Journal of Statistical Software, 40(1), 1-29. URL
##   http://www.jstatsoft.org/v40/i01/.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {The Split-Apply-Combine Strategy for Data Analysis},
##     author = {Hadley Wickham},
##     journal = {Journal of Statistical Software},
##     year = {2011},
##     volume = {40},
##     number = {1},
##     pages = {1--29},
##     url = {http://www.jstatsoft.org/v40/i01/},
##   }
citation("tidyverse")
## 
##   Wickham et al., (2019). Welcome to the tidyverse. Journal of Open
##   Source Software, 4(43), 1686, https://doi.org/10.21105/joss.01686
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Welcome to the {tidyverse}},
##     author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
##     year = {2019},
##     journal = {Journal of Open Source Software},
##     volume = {4},
##     number = {43},
##     pages = {1686},
##     doi = {10.21105/joss.01686},
##   }
citation("xlsx")
## 
## To cite package 'xlsx' in publications use:
## 
##   Adrian Dragulescu and Cole Arendt (2020). xlsx: Read, Write, Format
##   Excel 2007 and Excel 97/2000/XP/2003 Files. R package version 0.6.3.
##   https://CRAN.R-project.org/package=xlsx
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {xlsx: Read, Write, Format Excel 2007 and Excel 97/2000/XP/2003 Files},
##     author = {Adrian Dragulescu and Cole Arendt},
##     year = {2020},
##     note = {R package version 0.6.3},
##     url = {https://CRAN.R-project.org/package=xlsx},
##   }