################
## EXERCISE 3 ##
################

#0. read data from file
# Tab-separated text file with a header row. Column layout, by position:
#   col1 = written, col2 = oral.1, col3 = oral.2, col4 = oral.3
marks <- read.table("dati_esercitazione_voti.txt", sep = "\t", header = TRUE)

#1. compute mean and variance of written marks
mw <- mean(marks[, 1])
vw <- var(marks[, 1])

#2. compute mean and variance of oral_1 marks
mo <- mean(marks[, 2])
vo <- var(marks[, 2])

#3.0 written: relative frequency plot
# Relative frequency of each distinct written mark (count / number of rows).
wfr <- table(marks[, 1]) / nrow(marks)
# Default axes are switched off so the ticks can be placed by hand below.
plot(wfr, type = "h", xlab = "written_marks", ylab = "relative frequency",
     lwd = 5, xaxt = "n", yaxt = "n", col = "darkblue")
# One x tick per distinct observed mark; passing the raw column would request
# the same tick once per observation, in file order.
axis(1, sort(unique(marks[, 1])), las = 1)
# y ticks only at the relative frequencies that actually occur.
axis(2, unique(wfr), las = 1, cex.axis = 0.8)


######################################
## case study A: written vs oral_1  ##
######################################
### written and oral_1 test marks are similar (same mean, variance and distribution), but they are independent variables (see covariance and correlation)

#3.1 oral_1: relative frequency plot
# Relative frequency of each distinct oral_1 mark (count / number of rows).
o1fr <- table(marks[, 2]) / nrow(marks)
# Default axes are switched off so the ticks can be placed by hand below.
plot(o1fr, type = "h", xlab = "oral_1_marks", ylab = "relative frequency",
     lwd = 5, xaxt = "n", yaxt = "n", col = "darkgreen")
# One x tick per distinct observed mark instead of one request per observation.
axis(1, sort(unique(marks[, 2])), las = 1)
axis(2, unique(o1fr), las = 1, cex.axis = 0.8)

#4 empirical cumulative distribution functions of written and oral_1 marks
# Axis limits are computed from BOTH columns: otherwise they are taken from the
# first curve only and the overlaid curve can be clipped.
rng_a <- range(marks[, 1], marks[, 2], na.rm = TRUE)
pad_a <- 0.08 * diff(rng_a)   # small margin so the flat 0 and 1 tails stay visible

## way 1: built-in ecdf()
plot(ecdf(marks[, 1]), xlim = rng_a + c(-pad_a, pad_a),
     main = "marks distribution comparison between written and oral_1 marks test")
plot(ecdf(marks[, 2]), add = TRUE, col = "red")
legend("bottomright", legend = c("written", "oral_1"),
       col = c("black", "red"), lty = 1, bty = "n")

## way 2: the same step functions built by hand from cumulative relative frequencies
# NOTE: the historical variable names are kept as they were:
# `co` holds the written marks (column 1), `cw` holds the oral_1 marks (column 2).
co <- cumsum(table(marks[, 1]) / nrow(marks))
cw <- cumsum(table(marks[, 2]) / nrow(marks))
# names() of a table are strings, so convert them to numeric x positions explicitly.
plot(as.numeric(names(cw)), cw, type = "s", xlim = rng_a, ylim = c(0, 1),
     main = "marks distribution comparison between written and oral_1 marks test",
     xlab = "marks", ylab = "F(x)")
lines(as.numeric(names(co)), co, type = "s", col = "gold")
legend("bottomright", legend = c("oral_1", "written"),
       col = c("black", "gold"), lty = 1, bty = "n")

#5 contingency table
# Joint absolute frequencies: rows = written marks, columns = oral_1 marks.
table(marks[, 1], marks[, 2])
# Build the two-way contingency table (joint absolute frequencies) of x and y.
#
# Args:
#   x: vector whose distinct values become the row labels.
#   y: vector of the same length as x whose distinct values become the
#      column labels.
#
# Returns:
#   A numeric matrix with one row per distinct value of x and one column per
#   distinct value of y (labels taken from table(x) and table(y)); cell [i, j]
#   is the number of positions where x matches row label i and y matches
#   column label j.
#
# Raises:
#   An error if x and y differ in length.
my.contingency.table <- function(x, y) {
  # Base table(x, y) rejects arguments of different length. Without this check
  # the subsetting below would pair values from unrelated positions (or index
  # past the end of y) and silently return wrong counts.
  if (length(x) != length(y)) {
    stop("x and y must have the same length", call. = FALSE)
  }

  xf <- table(x)
  yf <- table(y)

  # Start from an all-zero matrix so combinations that never occur stay at 0.
  m <- matrix(0, length(xf), length(yf))
  dimnames(m) <- list(names(xf), names(yf))

  for (i in rownames(m)) {
    # Frequencies of y restricted to the positions where x equals this row label.
    # (The original note mentions absolute.frequency() from analisidati.R as an
    # alternative to table() here.)
    z <- table(y[which(x == i)])
    # Fill only the columns that actually occur together with this x value.
    m[i, names(z)] <- z
  }
  m
}
# Cross-tabulate written (rows) against oral_1 (columns) with the hand-written helper.
my.contingency.table(x = marks[[1]], y = marks[[2]])

#6. oral_1: scatter plot
# oral_1 marks on the horizontal axis, written marks on the vertical axis.
plot(
  x = marks[[2]],
  y = marks[[1]],
  xlab = "oral_1",
  ylab = "written",
  col = "gold",
  pch = 20
)

#7. correlation and covariance
# Covariance first, then the correlation coefficient, of written vs oral_1.
cov(x = marks[[1]], y = marks[[2]])
cor(x = marks[[1]], y = marks[[2]])


######################################
## case study B: written vs oral_2  ##
######################################
### written and oral_2 test marks are dependent but not linearly related (see covariance and correlation)

#8 oral_2: relative frequency plot
# Relative frequency of each distinct oral_2 mark (count / number of rows).
o2fr <- table(marks[, 3]) / nrow(marks)
# Default axes are switched off so the ticks can be placed by hand below.
plot(o2fr, type = "h", xlab = "oral_2_marks", ylab = "relative frequency",
     lwd = 5, xaxt = "n", yaxt = "n", col = "darkgreen")
# One x tick per distinct observed mark instead of one request per observation.
axis(1, sort(unique(marks[, 3])), las = 1)
axis(2, unique(o2fr), las = 1, cex.axis = 0.8)

#9. empirical cumulative distribution functions of written and oral_2 marks
# Axis limits are computed from BOTH columns: otherwise they are taken from the
# first curve only and the overlaid curve can be clipped.
rng_b <- range(marks[, 1], marks[, 3], na.rm = TRUE)
pad_b <- 0.08 * diff(rng_b)   # small margin so the flat 0 and 1 tails stay visible

## way 1: built-in ecdf()
plot(ecdf(marks[, 1]), xlim = rng_b + c(-pad_b, pad_b),
     main = "marks distribution comparison between written and oral_2 marks test")
plot(ecdf(marks[, 3]), add = TRUE, col = "red")
legend("bottomright", legend = c("written", "oral_2"),
       col = c("black", "red"), lty = 1, bty = "n")

## way 2: the same step functions built by hand from cumulative relative frequencies
# NOTE: the historical variable names are kept as they were:
# `co` holds the written marks (column 1), `cw` holds the oral_2 marks (column 3).
co <- cumsum(table(marks[, 1]) / nrow(marks))
cw <- cumsum(table(marks[, 3]) / nrow(marks))
# names() of a table are strings, so convert them to numeric x positions explicitly.
plot(as.numeric(names(cw)), cw, type = "s", xlim = rng_b, ylim = c(0, 1),
     main = "marks distribution comparison between written and oral_2 marks test",
     ylab = "F(x)", xlab = "marks")
lines(as.numeric(names(co)), co, type = "s", col = "gold")
legend("bottomright", legend = c("oral_2", "written"),
       col = c("black", "gold"), lty = 1, bty = "n")

#10. contingency table
# Joint absolute frequencies: rows = written marks, columns = oral_2 marks.
table(marks[, 1], marks[, 3])      ## alternative: my.contingency.table(marks[,1],marks[,3])

#11. oral_2: scatter plot
# oral_2 marks on the horizontal axis, written marks on the vertical axis.
plot(marks[, 3], marks[, 1], pch = 20, col = "gold", xlab = "oral_2", ylab = "written")

#12. correlation and covariance
cov(marks[, 1], marks[, 3])
cor(marks[, 1], marks[, 3])


######################################
## case study C: written vs oral_3  ##
######################################
### written and oral_3 test marks have different mean, variance and distribution, but they are linearly related

#13 oral_3: relative frequency plot
# Relative frequency of each distinct oral_3 mark (count / number of rows).
o3fr <- table(marks[, 4]) / nrow(marks)
# Default axes are switched off so the ticks can be placed by hand below.
plot(o3fr, type = "h", xlab = "oral_3_marks", ylab = "relative frequency",
     lwd = 5, xaxt = "n", yaxt = "n", col = "darkgreen")
# One x tick per distinct observed mark instead of one request per observation.
axis(1, sort(unique(marks[, 4])), las = 1)
axis(2, unique(o3fr), las = 1, cex.axis = 0.8)

#14. empirical cumulative distribution functions of written and oral_3 marks
# Axis limits are computed from BOTH columns: otherwise they are taken from the
# first curve only and the overlaid curve can be clipped, which matters here
# because the two sets of marks are expected to have different distributions.
rng_c <- range(marks[, 1], marks[, 4], na.rm = TRUE)
pad_c <- 0.08 * diff(rng_c)   # small margin so the flat 0 and 1 tails stay visible

## way 1: built-in ecdf()
plot(ecdf(marks[, 1]), xlim = rng_c + c(-pad_c, pad_c),
     main = "marks distribution comparison between written and oral_3 marks test")
plot(ecdf(marks[, 4]), add = TRUE, col = "blue")
legend("bottomright", legend = c("written", "oral_3"),
       col = c("black", "blue"), lty = 1, bty = "n")

## way 2: the same step functions built by hand from cumulative relative frequencies
# NOTE: the historical variable names are kept as they were:
# `co` holds the written marks (column 1), `cw` holds the oral_3 marks (column 4).
co <- cumsum(table(marks[, 1]) / nrow(marks))
cw <- cumsum(table(marks[, 4]) / nrow(marks))
# names() of a table are strings, so convert them to numeric x positions explicitly.
plot(as.numeric(names(co)), co, type = "s", xlim = rng_c, ylim = c(0, 1),
     main = "marks distribution: comparison between written and oral_3 test marks",
     ylab = "F(x)", xlab = "marks")
lines(as.numeric(names(cw)), cw, type = "s", col = "gold")
legend("bottomright", legend = c("written", "oral_3"),
       col = c("black", "gold"), lty = 1, bty = "n")

#15. contingency table
# Joint absolute frequencies: rows = written marks, columns = oral_3 marks.
table(marks[, 1], marks[, 4])      ## alternative: my.contingency.table(marks[,1],marks[,4])

#16. oral_3: scatter plot
# oral_3 marks on the horizontal axis, written marks on the vertical axis.
# (The x label previously read "oral_2" although column 4 is plotted.)
plot(marks[, 4], marks[, 1], pch = 20, col = "gold", xlab = "oral_3", ylab = "written")

#17. correlation and covariance
cov(marks[, 1], marks[, 4])
cor(marks[, 1], marks[, 4])