################################################
#### Descriptive Statistics: Main Functions ####
################################################
# mean function
# x: numeric vector containing the values for which mean is to be computed
mean.fun <- function(x){
if(any(is.na(x)))
x <- x[!is.na(x)];
res <- 0;
for(i in 1:length(x))
res <- res + x[i];
av <- res/length(x);
av;
}
# variance function
# x: numeric vector containing the values for which variance is to be computed
var.fun <- function(x){
if(any(is.na(x)))
x <- x[!is.na(x)];
y <- 0;
for(i in 1:length(x))
y <- y + (x[i] - mean.fun(x))^2;
var <- y/(length(x)-1);
var;
}
# standard deviation function
# x: numeric vector containing the values for which standard deviation is to be computed
sd.fun <- function(x){
return(sqrt(var.fun(x)))
}
# trimmed mean function
# x: numeric vector containing the values for which trimmed mean is to be computed
# note: if trim > 0.5 the function mean of the package base of R return the median value (see help)
trimmed.mean <- function(x,trim){
if(trim>=0.5)
return(median(x)); #or return("error, trim must be < 0.5") ---> maybe the better solution
x <- sort(x);
outlier <- floor(trim * length(x));
trim.mean <- mean(x[(outlier+1):(length(x)-outlier)]);
trim.mean;
}
trimmed.mean2 <- function(x,trim){
if(trim>=0.5)
return(median(x)); #or return("error, trim must be < 0.5")
x <- sort(x);
l <- length(x);
lo <- floor(trim * length(x)) + 1;
hi <- l - lo + 1;
trim.mean <- mean(x[lo:hi])
trim.mean;
}
# median function
# x: numeric vector containing the values for which median is to be computed
median.fun <- function(x){
x <- sort(x); #note: if the vector x contains same NA values sort function removes it
n <- length(x);
r <- n%%2;
if(r==0){
y <- 0.5 * (x[n/2] + x[n/2+1])
}else{
y <- x[(n+1)/2]
}
y;
}
# percentile function: Nearest Rank Method
# x: numeric vector containing the values for which percentile is to be computed
# prob: probability (value in [0,1]) for which quantile is wanted
quantile.fun <- function(x,prob){
if(any(prob > 1))
return("stop, prob must be < 1");
if(any(prob==0)){
y <- min(x);
names(y) <- "0%";
}else{
x <- sort(x);
n <- length(x);
p <- prob*n;
y <- x[ceiling(p)];
names(y) <- paste(prob*100,"%",sep="");
}
y;
}
# percentile function: Nearest Rank Method
# x: numeric vector containing the values for which percentile is to be computed
# prob: numeric vector of probability with value in [0,1] for which quantile is wanted
# note: the probability 0 (prob=0) must to be only at the beginning of the vector x; def=seq(0,1,0.25)
quantile.fun2 <- function(x,prob=seq(0,1,0.25)){
if(any(prob > 1))
return("stop, prob must be < 1");
x <- sort(x);
n <- length(x);
if(any(prob==0)){
y <- c();
y[which(prob==0)] <- min(x);
p <- prob*n;
y <- c(y,x[ceiling(p)]);
names(y) <- paste(prob*100,"%",sep="");
}else{
p <- prob*n;
y <- x[ceiling(p)];
names(y) <- paste(prob*100,"%",sep="");
}
y;
}
# percentile function: Nearest Rank Method
# x: numeric vector containing the values for which percentile is to be computed
# prob: numeric vector of probability with value in [0,1] for which quantile is wanted
# note: the probability 0 (prob=0) can be found in any position of the vector x
quantile.fun3 <- function(x,prob){
if(any(prob > 1))
return("stop, prob > [0,1]");
x <- sort(x);
n <- length(x);
if(any(prob==0)){
y <- rep(0,length(prob));
names(y) <- paste(prob*100,"%",sep="");
y[which(prob==0)] <- min(x);
p <- prob*n;
y[which(prob!=0)] <- x[ceiling(p)];
}else{
p <- prob*n;
y <- x[ceiling(p)];
names(y) <- paste(prob*100,"%",sep="");
}
y;
}
# percentile function: R or Microsoft Excel Method
# x: numeric vector containing the values for which percentile is to be computed
# prob: numeric vector of probability with value in [0,1] for which quantile is wanted
# note: the probability 0 (prob=0) can be found in any position of the vector x
quantile.fun4 <- function(x,prob){
if(any(prob > 1))
return("stop, prob > [0,1]");
x <- sort(x);
n <- length(x);
alpha <- 1 + (n-1)*prob;
if(all(alpha - round(alpha)==0)){
y <- x[alpha]
names(y) <- paste(prob*100,"%",sep="");
}else{
y <- rep(0,length(prob));
names(y) <- paste(prob*100,"%",sep="");
y[which(prob!=1)] <- x[floor(alpha[prob!=1])] + (alpha[prob!=1] - floor(alpha[prob!=1])) * (x[floor(alpha[prob!=1])+1] - x[floor(alpha[prob!=1])]);
y[which(prob==1)] <- max(x);
}
y;
}
# absolute frequency function
# x: numeric vector containing the values for which absolute frequency is to be computed
absolute.frequency <- function(x){
v <- unique(x);
v <- sort(v);
y <- rep(0,length(v));
names(y) <- v;
for(j in 1:length(v))
y[j] <- length(which(v[j]==x));
y;
}
absolute.frequency2 <- function(x){
v <- unique(x);
v <- sort(v);
y <- rep(0,length(v));
names(y) <- v;
counter <- 1;
for(i in v){
y[counter] <- sum(x==i);
counter <- counter + 1;
}
y;
}
# relative frequency function
# x: numeric vector containing the values for which relative frequency is to be computed
relative.frequency <- function(x){
abs.freq <- table(x); #or absolute.frequency(x)
n <- length(x);
rel.freq <- abs.freq/n;
rel.freq;
}
# cumulative frequency function
# x: numeric vector containing the values for which cumulative frequency is to be computed
# absolute: boolean. If true absolute cumulative frequency is computed otherwise relative cumulative frequency is computed
cumulative.frequency <- function(x,absolute=TRUE){
if(absolute){
abs.freq <- table(x); # absolute.frequency(x)
cum.rel <- abs.freq[1];
for(i in 2:length(abs.freq))
cum.rel <- c(cum.rel, abs.freq[i] + cum.rel[i-1]);
}else{
abs.freq <- table(x); # absolute.frequency(x)
rel.freq <- abs.freq/length(x);
cum.rel <- rel.freq[1];
for(i in 2:length(rel.freq))
cum.rel <- c(cum.rel, rel.freq[i] + cum.rel[i-1]);
}
cum.rel;
}
# mode function
# x: numeric vector containing the values for which mode is to be computed
mode.fun <- function(x){
abs.freq <- table(x); # absolute.frequency(x)
mod <- abs.freq[abs.freq==max(abs.freq)];
mod;
}
################################################################################
# mean with absolute frequency
# x <- table(dati[,1])
# sum(as.integer(names(x)) * x) / sum(x)
# mean with with relative frequency
# y <- table(dati[,1])/length(dati[,1])
# sum(as.integer(names(y)) * y)
# median with cumulative frequency...
#####################################################################################à
################################
##### Heterogeneity Index #####
################################
# Gini index
# x: numeric vector containing the values for which gini index is to be computed
gini <- function(x){
f <- table(x)/length(x); #f <- absolute.frequency(x)/length(x)
k <- length(f);
if (k == 1)
return(0);
I <- k * (1 - sum(f^2))/(k - 1);
I;
}
# Shannon Entropy
# x: numeric vector containing the values for which Shannon entropy is to be computed
entropia <- function(x){
f <- table(x)/length(x);
k <- length(f);
IA <- log(1/f); # log(1/f) == -log(f)
HX <- sum(f*IA)/log(k);
HX;
}
entropia2 <- function(x){
f <- table(x)/length(x);
k <- length(f);
IA <- -log(f,2);
HX <- sum(f*IA)/log(k,2);
HX;
}
######################################################################################
################################
##### Concentration Index #####
################################
#Gini coefficient
gini.lorenz <- function (x, plot=TRUE){
x <- sort(x);
n <- length(x);
P <- (0:n)/n; # beginning from 0 for the plot; keep in mind for the Gini index computing
Q <- c(0,cumsum(x)/sum(x));
G <- 1 - (2*sum(Q[-(n+1)])/(n-1)); # or G <- 2 * sum(P-Q)/(n-1)
if(plot){
#pdf(file="C:/Users/Marco/Desktop/Analisi_dei_Dati/Lorenz_Curve.pdf");
plot(P, Q, type = "b", main = "Lorenz Curve");
polygon(P, Q, density = 25, angle = 90, col="blue", border="red");
#dev.off();
}
res <- list(P=P,Q=Q,G=G);
return(res);
}
######################################################################################
######################################################################################
matrix.prod <- function(m,z){
if(ncol(m)!=nrow(z))
return("error!!")
x <- matrix(0,nrow(m),ncol(z))
for(i in 1:nrow(m))
for(j in 1:ncol(z)) x[i,j] <- sum(m[i,] * z[,j])
x
}
scalar.prod <- function(x,y){
res <- 0
for(i in 1:(length(x)))
res <- res + x[i]*y[i]
res
}