R library

################################################
#### Descriptive Statistics: Main Functions ####
################################################
# mean function
# x: numeric vector containing the values for which mean is to be computed
mean.fun <- function(x){
    if(any(is.na(x)))
        x <- x[!is.na(x)];
    res <- 0;
    for(i in 1:length(x))
        res <- res + x[i];
    av  <- res/length(x);
    av;
}

# variance function
# x: numeric vector containing the values for which variance is to be computed
var.fun <- function(x){
    if(any(is.na(x)))
        x <- x[!is.na(x)];
    y <- 0;
    for(i in 1:length(x))
        y  <- y + (x[i] - mean.fun(x))^2;
    var <- y/(length(x)-1);
    var;
}

# standard deviation function
# x: numeric vector containing the values for which standard deviation is to be computed
sd.fun <- function(x){
    return(sqrt(var.fun(x)))
}

# trimmed mean function
# x: numeric vector containing the values for which trimmed mean is to be computed
# note: if trim > 0.5 the function mean of the package base of R return the median value (see help)
trimmed.mean <- function(x,trim){
    if(trim>=0.5)
        return(median(x));      #or return("error, trim must be < 0.5")  ---> maybe the better solution
    x <- sort(x);
    outlier <- floor(trim * length(x));
    trim.mean <- mean(x[(outlier+1):(length(x)-outlier)]);
    trim.mean;
}
trimmed.mean2 <- function(x,trim){
    if(trim>=0.5)
        return(median(x));      #or return("error, trim must be < 0.5")
    x <- sort(x);
    l <- length(x);
    lo <- floor(trim * length(x)) + 1;
    hi <- l - lo + 1;
    trim.mean <- mean(x[lo:hi])
    trim.mean;
}

# median function
# x: numeric vector containing the values for which median is to be computed
median.fun <- function(x){
    x <- sort(x);       #note: if the vector x contains same NA values sort function removes it 
    n <- length(x);
    r <- n%%2;
    if(r==0){
        y <- 0.5 * (x[n/2] + x[n/2+1])
    }else{
        y <- x[(n+1)/2]
    }
    y;
}

# percentile function: Nearest Rank Method
# x: numeric vector containing the values for which percentile is to be computed
# prob: probability (value in [0,1]) for which quantile is wanted
quantile.fun <- function(x,prob){
    if(any(prob > 1))
        return("stop, prob must be < 1");
    if(any(prob==0)){
        y <- min(x);
        names(y) <- "0%";
    }else{
        x <- sort(x);
        n <- length(x);
        p <- prob*n;
        y <- x[ceiling(p)];
        names(y) <- paste(prob*100,"%",sep="");
    }
    y;
}
# percentile function: Nearest Rank Method
# x: numeric vector containing the values for which percentile is to be computed
# prob: numeric vector of probability with value in [0,1] for which quantile is wanted
# note: the probability 0 (prob=0) must to be only at the beginning of the vector x; def=seq(0,1,0.25)
quantile.fun2 <- function(x,prob=seq(0,1,0.25)){
    if(any(prob > 1))
        return("stop, prob must be < 1");
    x <- sort(x);
    n <- length(x);
    if(any(prob==0)){
        y <- c();
        y[which(prob==0)] <- min(x);
        p <- prob*n;
        y <- c(y,x[ceiling(p)]);
        names(y) <- paste(prob*100,"%",sep="");
    }else{
        p <- prob*n;
        y <- x[ceiling(p)];
        names(y) <- paste(prob*100,"%",sep="");
    }
    y;
}
# percentile function: Nearest Rank Method
# x: numeric vector containing the values for which percentile is to be computed
# prob: numeric vector of probability with value in [0,1] for which quantile is wanted
# note: the probability 0 (prob=0) can be found in any position of the vector x
quantile.fun3 <- function(x,prob){
    if(any(prob > 1))
        return("stop, prob > [0,1]");
    x <- sort(x);
    n <- length(x);
    if(any(prob==0)){
        y <- rep(0,length(prob));
        names(y) <- paste(prob*100,"%",sep="");
        y[which(prob==0)] <- min(x);
        p <- prob*n;
        y[which(prob!=0)] <- x[ceiling(p)];
    }else{
        p <- prob*n;
        y <- x[ceiling(p)];
        names(y) <- paste(prob*100,"%",sep="");
    }
    y;
}
# percentile function: R or Microsoft Excel Method 
# x: numeric vector containing the values for which percentile is to be computed
# prob: numeric vector of probability with value in [0,1] for which quantile is wanted
# note: the probability 0 (prob=0) can be found in any position of the vector x
quantile.fun4 <- function(x,prob){
    if(any(prob > 1))
        return("stop, prob > [0,1]");
    x <- sort(x);
    n <- length(x);
    alpha <- 1 + (n-1)*prob;
    if(all(alpha - round(alpha)==0)){
        y <- x[alpha]
        names(y) <- paste(prob*100,"%",sep="");
    }else{
        y <- rep(0,length(prob));
        names(y) <- paste(prob*100,"%",sep="");
        y[which(prob!=1)] <- x[floor(alpha[prob!=1])] + (alpha[prob!=1] - floor(alpha[prob!=1])) * (x[floor(alpha[prob!=1])+1] - x[floor(alpha[prob!=1])]);
        y[which(prob==1)] <- max(x);
    }
    y;
}

# absolute frequency function
# x: numeric vector containing the values for which absolute frequency is to be computed
absolute.frequency <- function(x){
    v <- unique(x);
    v <- sort(v);
    y <- rep(0,length(v));
    names(y) <- v;
    for(j in 1:length(v))
        y[j] <- length(which(v[j]==x));
    y;
}
absolute.frequency2 <- function(x){
    v <- unique(x);
    v <- sort(v);
    y <- rep(0,length(v));
    names(y) <- v;
    counter <- 1;
    for(i in v){
        y[counter] <- sum(x==i);
        counter <- counter + 1;
    }
    y;
}

# relative frequency function
# x: numeric vector containing the values for which relative frequency is to be computed
relative.frequency <- function(x){
    abs.freq <- table(x);       #or absolute.frequency(x)
    n <- length(x);
    rel.freq <- abs.freq/n;
    rel.freq;
}

# cumulative frequency function
# x: numeric vector containing the values for which cumulative frequency is to be computed
# absolute: boolean. If true absolute cumulative frequency is computed otherwise relative cumulative frequency is computed   
cumulative.frequency <- function(x,absolute=TRUE){
    if(absolute){
        abs.freq <- table(x);  # absolute.frequency(x)
        cum.rel <- abs.freq[1];
        for(i in 2:length(abs.freq))
            cum.rel <- c(cum.rel, abs.freq[i] + cum.rel[i-1]);
    }else{
        abs.freq <- table(x);  # absolute.frequency(x)
        rel.freq <- abs.freq/length(x);
        cum.rel <- rel.freq[1];
        for(i in 2:length(rel.freq))
            cum.rel <- c(cum.rel, rel.freq[i] + cum.rel[i-1]);
    }
    cum.rel;
}

# mode function
# x: numeric vector containing the values for which mode is to be computed
mode.fun <- function(x){
    abs.freq <- table(x);       # absolute.frequency(x)
    mod <- abs.freq[abs.freq==max(abs.freq)];
    mod;
}
################################################################################
# mean with absolute frequency 
# x <- table(dati[,1])
# sum(as.integer(names(x)) * x) / sum(x)

# mean with with relative frequency
# y <- table(dati[,1])/length(dati[,1])
# sum(as.integer(names(y)) * y)

# median with cumulative frequency...       
#####################################################################################à
################################
##### Heterogeneity Index  #####
################################
# Gini index
# x: numeric vector containing the values for which gini index is to be computed
gini <- function(x){
    f <- table(x)/length(x);        #f <- absolute.frequency(x)/length(x)
    k <- length(f);
    if (k == 1)
       return(0);
    I <- k * (1 - sum(f^2))/(k - 1);
    I;
}

# Shannon Entropy 
# x: numeric vector containing the values for which Shannon entropy is to be computed
entropia <- function(x){
    f <- table(x)/length(x);
    k <- length(f);
    IA <- log(1/f);             # log(1/f) == -log(f)
    HX <- sum(f*IA)/log(k);
    HX;
}
entropia2 <- function(x){
    f <- table(x)/length(x);
    k <- length(f);
    IA <- -log(f,2);
    HX <- sum(f*IA)/log(k,2);
    HX;
}

######################################################################################
################################
##### Concentration Index  #####
################################

#Gini coefficient 
gini.lorenz <- function (x, plot=TRUE){
    x <- sort(x);
    n <- length(x);
    P <- (0:n)/n;           # beginning from 0 for the plot; keep in mind for the Gini index computing 
    Q <- c(0,cumsum(x)/sum(x));
    G <- 1 - (2*sum(Q[-(n+1)])/(n-1));   # or G <- 2 * sum(P-Q)/(n-1)
    if(plot){
        #pdf(file="C:/Users/Marco/Desktop/Analisi_dei_Dati/Lorenz_Curve.pdf");
        plot(P, Q, type = "b", main = "Lorenz Curve");
        polygon(P, Q, density = 25, angle = 90, col="blue", border="red");
        #dev.off();
    }
    res <- list(P=P,Q=Q,G=G);
    return(res);
}
######################################################################################
######################################################################################
matrix.prod <- function(m,z){
    if(ncol(m)!=nrow(z))
        return("error!!")
    x <- matrix(0,nrow(m),ncol(z))
    for(i in 1:nrow(m))
        for(j in 1:ncol(z)) x[i,j] <- sum(m[i,] * z[,j])
    x
}
scalar.prod <- function(x,y){
    res <- 0
    for(i in 1:(length(x)))
        res <- res + x[i]*y[i]
    res
}