# --------------------------------------------------------------------------
# -------- Übung zur Vorlesung Generalisierte Regressionsmodelle -----------
# -------- R Code / Blatt 8                                      -----------
# --------------------------------------------------------------------------

# -------- Aufgabe 9 ----------------------------------------------------------

# lade Paket 
library("penalized")

# Daten einlesen
soep <- read.table("fakesoep.dat",header=T)

# logarithmierter response 
plot(density(log(soep$beink)))
soep$beinkLog <- log(soep$beink)

## a)
# unpenalisiertes Modell 
glmGauß <- glm(beinkLog ~ groesse + alter + dauer + verh + geschl + deutsch + abitur,data=soep,
                family=gaussian)
summary(glmGauß)
# einige nicht signifikante Effekte -> Variablenselektion erforderlich

## b)
library("e1071")
covar.combinations <- bincombinations(7)
nrow(covar.combinations)
colnames(covar.combinations) <- colnames(soep[, -1])
head(covar.combinations) # erste zeile  -> null-modell
tail(covar.combinations) # letzte zeile -> volles modell

## c)
# Schrittweise Variablenselektion basierend auf AIC-Kriterium, ausgehend vom
# vollen Modell:
fullstepml.fit <- step(glmGauß, direction = "both") # stepAIC
summary(fullstepml.fit)

# Null-Modell:
partialfit <- glm(beinkLog ~ 1, data = soep, family=gaussian)
summary(partialfit)

# Schrittweise Variablenselektion ausgehend vom Null-Modell:
# scope gibt an, bis zu welchem Modell maximal variablen aufgenommen werden
partialstepml.fit <- step(partialfit, scope = beinkLog ~ groesse + alter + dauer + 
                          verh + geschl + deutsch + abitur, direction = "both")
summary(partialstepml.fit)
                         
# Vergleich der Ergebnisse 
AIC(fullstepml.fit); AIC(partialstepml.fit)
all(names(fullstepml.fit$coefficients) %in% names(partialstepml.fit$coefficients))
all(names(partialstepml.fit$coefficients) %in% names(fullstepml.fit$coefficients))

## Fazit: hier führt schrittweise Variablenselektion ausgehend vom vollen oder
## vom Null-Modell zu denselben Ergebnissen.


## d)
# Funktion zum Plotten der Koeffizientenpfade 
myplotpath <- function (object, labelsize = 0.6, standardize = FALSE, ...) {
    
    if ( length(object) > 0 ) {
        betas <- sapply(object, coefficients, "p")
    }
    
    if ( standardize ) {
        weights <- weights(object[[1]])
        if ( length(weights) > nrow(betas) ) 
            weights <- weights[-seq_len(length(weights) - nrow(betas))]
        betas <- betas * matrix(weights, nrow = nrow(betas), ncol = ncol(betas))
    }

    remove <- apply(betas, 1, function(bet) all(bet == 0))
    if ( all(remove) ) 
        stop("all coefficients are zero for all values of lambda in this object")
    lambda <- sapply(object, function(object) object@lambda1)
    label <- expression("smoothing parameter "*lambda)
    
    if ( all(lambda == lambda[1]) ) {
        lambda <- sapply(object, function(object) object@lambda2)
        label <- expression("smoothing parameter "*lambda)
    }
    # set parameters for plot
    labwidth <- ifelse(labelsize > 0, 
        max(strwidth(rownames(betas[!remove, ]), "inches", labelsize)), 0)
    margins <- par("mai")
    par(mai = c(margins[1:3], max(margins[4], labwidth * 1.4)))
    
    # plot betas 
    matplot(lambda, t(betas[!remove, , drop = FALSE]), type = "l", 
        ylab = "estimated coefficient", xlab = label, col = "black", 
        xlim = rev(range(lambda)), lty=1, ...)

    if (labelsize > 0 && !is.null(rownames(betas))) {
        take <- which(!remove)
        for (i in 1:sum(!remove)) {
            j <- take[i]
            axis(4, at = betas[j, ncol(betas)], labels = rownames(betas)[j], 
                las = 1, cex.axis = labelsize, col.axis = "black", 
                lty = 1, col = "black")
        }
    }
    par(mai = margins)
    return(invisible(NULL))
}

# definiere variablen die penalisiert werden sollen (als formula)
# Intercept wird nicht penalisiert
penvars <- ~ groesse + alter + dauer + verh + geschl + deutsch + abitur

set.seed(12122016)
# LASSO-Modell
L1.fit <- optL1(response = beinkLog, penalized = penvars,
             standardize = TRUE, data = soep, model = "linear",
             fold = 10)
coef.L1 <- coefficients(L1.fit$fullfit) 
# fullfit enthält den aktuellen Fit. Beachte, dass fullfit ein S4-Objekt ist.
length(coef.L1)

# Überprüfe mit profL1(), ob das richtige Optimum gefunden wurde                              
prof1 <- profL1(response = beinkLog, penalized = penvars,
                standardize = TRUE, data = soep, model = "linear",
                fold = 10, steps = 100, minlambda = 1, maxlambda = 50)
                
# CV-Kriterium gegen lambda 
plot(prof1$lambda, prof1$cvl, type = "l")
abline(v = L1.fit$lambda, lty = 2)

# Koeffizientenpfade 
par(mai = c(0.8,0.8,0.1,0.8))
myplotpath(prof1$fullfit, standardize = TRUE, labelsize = 0.9)
abline(v = L1.fit$lambda, lty = 2)

# Vergleich der Koeffizienten
L1.fit$fullfit@unpenalized
L1.fit$fullfit@penalized
fullstepml.fit$coefficients

## e) ----------------- Auswertung mit Interaktionen ----------------------------- 


# a) 
# Unpenalisiertes Modell 
glmGaußIA <- glm(beinkLog ~ (groesse + alter + dauer + verh + geschl + deutsch + abitur)^2,data=soep,
               family=gaussian)
summary(glmGaußIA)

# c) 
# Schrittweise Variablenselektion basierend auf AIC-Kriterium, ausgehend vom
# vollen Modell:
fullstepml.fitIA <- step(glmGaußIA, direction = "both")
summary(fullstepml.fitIA)

# Null-Modell:
partialfitIA <- glm(beinkLog ~ 1,  data = soep, family = gaussian)
summary(partialfitIA)

# Schrittweise Variablenselektion ausgehend vom Null-Modell:
partialstepml.fitIA <- step(partialfitIA, scope = beinkLog ~ (groesse + alter + dauer + verh + geschl + deutsch + abitur)^2,
                          direction="both")
summary(partialstepml.fitIA)

# Vergleich der Erebnisse 
AIC(fullstepml.fitIA); AIC(partialstepml.fitIA)
all(names(fullstepml.fitIA$coefficients) %in% names(partialstepml.fitIA$coefficients))
all(names(partialstepml.fitIA$coefficients) %in% names(fullstepml.fitIA$coefficients))
length(coefficients(glmGaußIA))
length(coefficients(fullstepml.fitIA))
length(coefficients(partialstepml.fitIA))

## Fazit: je nach Komplexität des Modells führt schrittweise Variablenselektion
## ausgehend vom vollen oder vom Null-Modell zu unterschiedlichen Ergebnissen.

# d) 
penvars <- ~ (groesse + alter + dauer + verh + geschl + deutsch + abitur)^2

set.seed(12122016)
# LASSO-Modell
L1.fitIA <- optL1(response = beinkLog, penalized = penvars,
                standardize = TRUE, data = soep, model = "linear",
                fold = 10)
coef.L1IA <- coefficients(L1.fitIA$fullfit) 
length(coef.L1IA)

prof1IA <- profL1(response = beinkLog, penalized = penvars,
                standardize = TRUE, data = soep, model = "linear",
                fold = 10, steps = 100, minlambda = 5, maxlambda = 50)

plot(prof1IA$lambda, prof1IA$cvl, type = "l")
abline(v = L1.fitIA$lambda, lty = 2)

# Koeffizientenpfade
par(mai = c(0.8,0.8,0.1,0.8))
myplotpath(prof1IA$fullfit, standardize = TRUE, labelsize = 0.9)
abline(v = L1.fitIA$lambda, lty = 2)

# Vergleich der Koeffizienten
length(coef.L1IA)
length(fullstepml.fitIA$coefficients)
length(partialstepml.fitIA$coefficients)

L1.fit$fullfitIA@unpenalized
L1.fit$fullfitIA@penalized
fullstepml.fitIA$coefficients