################
## EXERCISE 2 ##
################

# 0. Load last year's marks from a comma-separated text file.
marksly <- scan(file = "markslastyear.txt", sep = ",")

# 1. Frequency tables over the distinct mark values.
abs.freq <- table(marksly)               # absolute frequency (counts per value)
rel.freq <- abs.freq / length(marksly)   # relative frequency (count / values read)
cum.abs <- cumsum(abs.freq)              # cumulative absolute frequency
cum.rel <- cumsum(rel.freq)              # cumulative relative frequency

# 2. Box plot of last year's marks.
# `whisker.range` is passed as boxplot()'s `range` argument: each whisker
# extends to the most extreme data point that lies no more than
# whisker.range * IQR away from the box; points further out are reported in
# `$out`. Change the value here only -- everything below reuses it.
whisker.range <- 1.5

# boxplot() draws the plot and invisibly returns its statistics, so a single
# call provides both the picture and the numbers used for the annotations.
bp <- boxplot(marksly, col = "darkgreen", main = "marks Box Plot Last Year",
              yaxt = "n", range = whisker.range, varwidth = TRUE)

# Five box-plot statistics (whisker ends, box edges, median): label them on
# the y axis and mark each with a dotted horizontal line.
stats <- as.vector(bp$stats)
axis(2, stats, las = 2)
abline(h = stats, lty = "dotted")

# Distinct outlier values; click near a point in the plot to label it.
out <- unique(bp$out)
n <- length(out)
if (n > 0) {
  # Nothing to identify when there are no outliers, so skip the interactive step.
  identify(rep(1, n), out, labels = out)
}

# 3.0 Empirical cumulative distribution function of last year's marks.
# Default axes are suppressed and replaced by ticks at the observed mark
# values (x) and at their cumulative relative frequencies (y).
plot(
  ecdf(marksly),
  main = "Empirical Cumulative Function marks Last Year",
  xlab = "marks last year",
  ylab = "cumulative relative frequency",
  cex.main = 0.8,
  xaxt = "n",
  yaxt = "n",
  col = "red"
)
axis(1, at = as.numeric(names(cum.rel)))
axis(2, at = cum.rel, las = 2, cex.axis = 0.5)
# Optional guide lines (uncomment to draw):
# abline(v = names(cum.rel), lty = "twodash")
# abline(h = cum.rel, lty = "dotted")

# 3.1 Step plot of the cumulative absolute frequency against the mark values.
# The mark values are the names of `cum.abs`, converted to numbers for plotting.
abs.x <- as.numeric(names(cum.abs))
plot(
  abs.x, cum.abs,
  type = "s",
  main = "Absolute Empirical Cumulative Function marks Last Year",
  xlab = "marks last year",
  ylab = "cumulative absolute frequency",
  cex.main = 0.8,
  xaxt = "n",
  yaxt = "n",
  col = "green"
)
# Mark every step with a filled point.
points(abs.x, cum.abs, col = "blue", pch = 21, bg = "red")
# Custom axes: one tick per observed mark and per cumulative count.
axis(1, at = abs.x)
axis(2, at = cum.abs, las = 2, cex.axis = 0.5)
# Optional guide lines (uncomment to draw):
# abline(v = names(cum.abs), lty = "twodash")
# abline(h = cum.abs, lty = "dotted")

# 3.2 Step plot of the cumulative relative frequency against the mark values.
# The mark values are the names of `cum.rel`; they are converted to numbers
# once and reused for the curve, the points and the x axis, so this section
# depends on `cum.rel` only.
rel.x <- as.numeric(names(cum.rel))
plot(rel.x, cum.rel, type = "s",
     main = "Relative Empirical Cumulative Function marks Last Year",
     ylab = "cumulative relative frequency", xlab = "marks last year",
     cex.main = 0.8, xaxt = "n", yaxt = "n", col = "green")
# Mark every step with a filled point.
points(rel.x, cum.rel, col = "blue", pch = 21, bg = "red")
# Custom axes: one tick per observed mark and per cumulative relative frequency.
axis(1, at = rel.x)
#abline(v=names(cum.rel),lty="twodash");
axis(2, at = cum.rel, las = 2, cex.axis = 0.5)
#abline(h=cum.rel,lty="dotted");

# 4. Histogram of last year's marks on the density scale.
## Things to try: set n.breaks to 40 and to 60, then approximate the mean
## from the histogram as sum(mid * den * diff(brk)), i.e. each bin midpoint
## weighted by the bin's area (density * bin width). The original note used a
## fixed bin width of 0.2 for those settings; diff(brk) works for any width.
n.breaks <- 15

# hist() draws the plot and invisibly returns the histogram object, so one
# call supplies the picture and every value used below.
h <- hist(marksly, freq = FALSE, breaks = n.breaks, col = "darkblue", axes = FALSE)
den <- h$density           # density of each bin
den.plot <- unique(den)    # distinct densities, used as y-axis ticks
brk <- h$breaks            # bin boundaries
mid <- h$mids              # bin midpoints

axis(1, at = brk, las = 2)
# Ticks sit at the exact densities (so they line up with the dotted lines
# below); only the printed labels are rounded.
axis(2, at = c(0, den.plot), labels = c(0, round(den.plot, 4)), las = 1)
abline(h = den, lty = "dotted")

## marks comparison: data in marks.txt versus data in markslastyear.txt
marks <- scan("marks.txt", sep = ",")
#marksly <- scan("markslastyear.txt",sep=",");

#ecdf
# Common x range covering BOTH samples, so neither curve is clipped and the
# plot limits agree with the axis ticks drawn below.
x.range <- range(marks, marksly)
xmin <- x.range  # legacy name kept for any later code; holds c(min, max)

# The title must be given on the first call: with add = TRUE no title is drawn.
plot(ecdf(marks), xlim = x.range, xaxt = "n", main = "ecdfs comparison")
plot(ecdf(marksly), add = TRUE, col = "red")
# Unit-spaced ticks from the overall minimum up to the overall maximum.
axis(1, at = x.range[1]:x.range[2])
# Normal Q-Q plot of last year's marks, with its reference line.
qqnorm(y = marksly)
qqline(y = marksly)
# Q-Q plot of the two samples against each other.
qqplot(x = marks, y = marksly)