# ----------------------------------------------
# An Algorithm to get
# 'the most linear independent' Regressors
# ----------------------------------------------

# Load DAX and component data (semicolon-separated, header row).
daxwithcomp <- read.table(
  "C:/Users/Admin/desktop/DAXwithComp.txt",
  header = TRUE, sep = ";"
)
head(daxwithcomp)
tail(daxwithcomp)
names(daxwithcomp)

# Keep only the last 4 years of data, 2011 - 2014.
# Rows 1..1558 are earlier dates -- inspect the boundary first:
daxwithcomp[1558, ]
daxwithcomp[1559, ]
daxwithcomp <- daxwithcomp[-(1:1558), ]
head(daxwithcomp)
tail(daxwithcomp)
dim(daxwithcomp)

# Remove all rows containing NA's:
daxwithcomp <- na.omit(daxwithcomp)
dim(daxwithcomp)

dax <- daxwithcomp[, 1]
uls <- daxwithcomp[, 2:31]  # `uls` for `underlyings`
str(uls)                    # uls is a data frame at this point

# NOTE: lm(dax ~ uls) fails while uls is a data frame;
# lm() wants a numeric matrix on the right-hand side here,
# so coerce first:
uls <- as.matrix(uls)
res <- lm(dax ~ uls)
summary(res)
plot(dax, type = "l")
lines(fitted(res), col = 2)  # fitted() avoids partial matching via res$fit

# Regression without intercept:
res <- lm(dax ~ -1 + uls)
summary(res)
plot(dax, type = "l")
lines(fitted(res), col = 2)

# We want to implement the following algorithm: compute
#
#   p_j := ||P_X(-j) x_j||^2 / ||x_j||^2
#
# for all valid underlyings (30 at the beginning), i.e. the share of
# underlying j explained by projecting it onto the span of the others,
# and remove the underlying j for which p_j is maximal.

# Encode the set of valid underlyings as a boolean vector of length 30.
# For example,
validuls <- rep(FALSE, 30)
validuls[3] <- TRUE
validuls[7] <- TRUE
validuls
# should mean that we only want to use underlying 3 and underlying 7
# for the DAX regression. How is this done technically?
# Actually very easy -- subset the matrix columns with the boolean mask:
res <- lm(dax ~ uls[, validuls])
res
# or without intercept:
res <- lm(dax ~ -1 + uls[, validuls])
res

# ------------------------------------------------
# Start algorithm (interactive version):
# repeatedly remove the underlying that is best explained by the
# remaining ones, until a single underlying is left.
validuls <- rep(TRUE, 30)
namesuls <- colnames(uls)
namesuls

for (iter in 1:29) {
  pj <- rep(0, 30)
  for (i in 1:30) {
    # only score underlyings that are still in the valid set
    if (validuls[i]) {
      y <- uls[, i]
      # regress underlying i on all OTHER remaining underlyings
      uls_ohne_i <- validuls
      uls_ohne_i[i] <- FALSE
      X <- uls[, uls_ohne_i]
      res <- lm(y ~ -1 + X)
      # p_j = ||fitted||^2 / ||y||^2 : share of y explained by the others
      pj[i] <- sum(fitted(res)^2) / sum(y^2)
    }
  }
  # drop the most redundant underlying (largest p_j)
  imax <- which.max(pj)
  validuls[imax] <- FALSE
  print(pj)
  print(paste("underlying", namesuls[imax], "has been removed."))

  # refit the DAX regression on the surviving underlyings and show
  # the fit plus the residuals side by side
  res <- lm(dax ~ -1 + uls[, validuls])
  par(mfrow = c(1, 2))
  info <- paste("after", iter, "iterations")
  plot(dax, type = "l", main = info)
  lines(fitted(res), col = 2)
  plot(dax - fitted(res), type = "l", main = info)
  readline("press enter to show regression summary..")
  print(summary(res))
  readline("press enter to remove next underlying..")
}
# end algorithm
# ------------------------------------------------

# Let's put this into a function.
#
# uls          : numeric matrix of underlyings (one column per series)
# n_uls        : number of underlyings to keep
# useintercept : include an intercept in the auxiliary regressions?
#                (default TRUE, backward compatible with positional calls)
#
# Returns a named logical vector marking the columns that were kept.
GetLinIndUls <- function(uls, n_uls, useintercept = TRUE) {
  n <- ncol(uls)
  validuls <- rep(TRUE, n)
  names(validuls) <- colnames(uls)
  # seq_len() (not 1:(n - n_uls)) so that n_uls == n removes nothing
  for (iter in seq_len(n - n_uls)) {
    pj <- rep(0, n)
    for (i in seq_len(n)) {
      if (validuls[i]) {
        y <- uls[, i]
        # project underlying i onto the span of the other valid ones
        uls_ohne_i <- validuls
        uls_ohne_i[i] <- FALSE
        X <- uls[, uls_ohne_i]
        if (useintercept) {
          res <- lm(y ~ X)
        } else {
          res <- lm(y ~ -1 + X)
        }
        pj[i] <- sum(fitted(res)^2) / sum(y^2)
      }
    }
    # remove the underlying best explained by the others
    imax <- which.max(pj)
    validuls[imax] <- FALSE
  }
  validuls
}

# For some day d we look 60 days in the past and apply the algorithm
# above to determine the 10 most linear independent underlyings.
# Then we move on week by week (5 trading days) to see how this
# set of underlyings is changing over time.
dim(uls)

n_obs  <- nrow(uls)   # number of trading days available
window <- 60          # look-back length in trading days
step   <- 5           # advance one trading week per iteration

# count, per underlying, how often it was selected over all windows
validulstotal <- rep(0, ncol(uls))
names(validulstotal) <- colnames(uls)

for (d in seq(from = window, to = n_obs, by = step)) {
  # select the 10 most linearly independent underlyings in the
  # 60-day window ending at day d (with intercept)
  validuls <- GetLinIndUls(uls[(d - window + 1):d, ], 10, TRUE)
  # logical + numeric: TRUE counts as 1 in the running total
  validulstotal <- validulstotal + validuls
  validulsnames <- names(validuls)[validuls]
  print(validulsnames)
}
validulstotal