# ------------------------------------------------------------------------------
# -------- Übung zur Vorlesung Generalisierte Regressionsmodelle ---------------
# -------- R Code / Blatt 2                                      ---------------
# ------------------------------------------------------------------------------


# -------- Aufgabe 1 -----------------------------------------------------------

## zusatzpakete
library(reshape2)
library(gridExtra)
library(ggplot2)
theme_set(theme_bw())

# datensatz von der homepage laden
shuttle <- read.table("shuttle.asc", header = TRUE)
# temperatur in Celsius hinzufügen
shuttle$tempC <- (shuttle$temp - 32)/1.8
shuttle


# a)
# summary, boxplot getrennt nach td (0/1)
summary(shuttle$temp[shuttle$td==0])
summary(shuttle$temp[shuttle$td==1])
boxplot(temp ~ td, data = shuttle, xlab="Thermische Überbeanspruchung (0 = nein / 1 = ja)",ylab="Temperatur")

# Alternativ mit ggplot
#gg.shuttle <- ggplot(shuttle)
#grid.arrange(
#        gg.shuttle + geom_boxplot(aes(x = factor(td), y = temp)),
#        gg.shuttle + geom_violin(aes(x = factor(td), y = temp)),
#        nrow = 1)

# Vorsicht! Eine derartige Darstellung mag den Eindruck erwecken, dass es sich 
# bei temp um die abhängige Variable handelt und bei td um die erklärende.
# In Hinblick auf evtl. Kausalität ist dies offensichtlicher Unsinn.


# b)
# lineares Modell
lm.f <- lm(td ~ temp, data = shuttle)
# nun zur Verfuegung stehende Komponenten des lm-Objekts
names(lm.f)
# Streudiagramm td gegen temp
plot(td ~ temp, data = shuttle, lwd = 2, 
        ylim = range(lm.f$fitted.values) + c(-0.1, 0.1), 
        col = rgb(0,0,0, alpha = 0.8), las = 1)
# gefittete Werte eintragen
abline(lm.f$coef,lwd=2) # Spezialfall für lineares Modell, 
# allgemeiner z.B. mit
points(shuttle$temp, lm.f$fitted.values, col = rgb(1,0,0, alpha = 0.5), lwd = 2) 
# oder
## lines(shuttle$temp,lm.f$fitted.values,col=2,lwd=2) # bei nicht-geordneten temp nicht so gut


# c)
# per hand
beta0.c <- lm.f$coefficients[1] + 32 * lm.f$coefficients[2]
beta1.c <- 1.8 * lm.f$coefficients[2]
# lm fuer Celsius
lm.c <- lm(td ~ tempC, data = shuttle)
# vergleich
c(beta0.c, beta1.c)
coefficients(lm.c)

# gleichzeitige Aufnahme von Temeratur in Fahrenheit und Celsius
summary(lm.multicol <- lm(td ~ temp + tempC, data = shuttle))


# d)
# nochmals lineares modell fuer Fahrenheit
summary(lm.f)

# Interpretation:
# - linkssteile Verteilung der Residuen (betrachte Quantile!),
#   links von 0 offenbar mehr Masse als rechts.
plot(density(lm.f$residuals), main = "Residuen", las = 1)
# - beide Parameter signifikant von 0 verschieden (insb. linearer Term), 
#   aber Vorsicht (siehe unten).
# - R^2 eher gering, aber Gesamtmodell als solches ist signifikant 
#   (ebenso Vorsicht).
anova(lm(td ~ 1, data = shuttle), lm.f)

# linearer Term ist signifikant zum Niveau alpha = 0.01. Aber t-Test
# verwendet Normalverteilung von y = td (bzw. Approximation), und
# Normalerverteilung von epsilon bzw. y sicher nicht gegeben. 
# (Für Approximation nicht genügend Daten vorhanden und Varianzhomogenität 
# nicht erfüllt, siehe unten.)

# Gegen lineares Modell spricht z.B. (siehe auch Vorlesung):
# - geschätzte Wahrscheinlichkeiten < 0 (und > 1) möglich.
# - keine Varianzhomogenität wegen Var(y) = p(1-p) bei binärem y.


# e) 
# Logit-Modell: h(.) ist logistische Funktion
glm.logit <- glm(td ~ temp, data = shuttle, family=binomial) 

# Probit-Modell: h(.) ist cdf von N(0,1)
glm.probit <- update(glm.logit, family = binomial(link = probit)) 

# Komplemetäres Log-log-Modell: h(.) ist cdf der Gompertz-Vtlg (nicht symmetrisch!)
glm.cloglog <- update(glm.logit, family = binomial(link = cloglog)) 

glm.list <- list(logit = glm.logit, probit = glm.probit, cloglog = glm.cloglog)

# Summaries
lapply(glm.list, summary)

# Plots:
# daten
plot(td ~ temp, data = shuttle)
# geschaetzte Wahrscheinlichkeiten
t.index <- order(shuttle$temp)
lines(shuttle$temp[t.index], glm.logit$fitted.values[t.index], 
        type = "b", lwd = 2, col = 2)
lines(shuttle$temp[t.index], glm.probit$fitted.values[t.index], 
        type = "b", lwd = 2, col = 3)
lines(shuttle$temp[t.index],glm.cloglog$fitted.values[t.index], 
        type = "b", lwd = 2, col = 4)

# Alternativ mit ggplot
#df.fitted <- sapply(glm.list, fitted)
#mdf.fitted <- melt(df.fitted)
#mdf.fitted$temp <- rep(shuttle$temp, ncol(df.fitted))
#gg.shuttle + geom_point(aes(x = temp, y = td))+
#        geom_line(data = mdf.fitted, aes(x = temp, y = value, colour = Var2)) + 
#        geom_point(data = mdf.fitted, aes(x = temp, y = value, colour = Var2)) + 
#        scale_color_discrete(name = "link")


# g)

# von Hand:
# Logit-Modell
etalogit <- glm.logit$coef[1] + glm.logit$coef[2]*31 # Zugriff auf geschätzte Parameter
(pi.logit <- plogis(etalogit)) # plogis(): logistische Funktion

# Probit-Modell
eta.probit <- glm.probit$coef[1] + glm.probit$coef[2]*31
(pi.probit <- pnorm(eta.probit)) # pnorm(): cdf der N(0,1)-Vtlg.

# Komp. Log-log-Modell
eta.cloglog <- glm.cloglog$coef[1] + glm.cloglog$coef[2]*31
(pi.cloglog <- 1 - exp(-exp(eta.cloglog))) # vgl. Hinweis

## compare
c(pi.logit, pi.probit, pi.cloglog)

# alternativ ueber predict funktion
# predict(model, newdata)
sapply(glm.list, predict, type = "response", newdata = data.frame(temp = 31))

# Die Wahrscheinlichkeit für das Auftreten der therm. Überbeanspruchung bei 
# einer Temperatur von 31°F ist in allen Modellen nahe eins; siehe auch
# Graphik aus e).

# h) 

# exakte Lösung, Herleitung der verwendeten Formeln siehe Übung
temp.logit <- -glm.logit$coef[1]/glm.logit$coef[2]
temp.probit <- -glm.probit$coef[1]/glm.probit$coef[2]
temp.cloglog <- (log(-log(0.5)) - glm.cloglog$coef[1])/glm.cloglog$coef[2]

c(temp.logit, temp.probit, temp.cloglog)

## advanced 
# liste mit linkfunktionen
link.functions <- list(
        logit = function(pi) log(pi/(1-pi)), 
        probit = function(pi) qnorm(pi), 
        cloglog = function(pi) log(-log(1 - pi)))

link.values <- sapply(link.functions, function(f) f(0.5))
link.values
coefs <- sapply(glm.list, coefficients)
coefs
(link.values - coefs[1, ])/coefs[2, ]

## fuer Wkt von 0.9
(sapply(link.functions, function(f) f(0.9)) - coefs[1, ]) / coefs[2, ]