algorithmica-repository
diff --git a/‎1.introduction/commands.txt‎
Lines changed: 56 additions & 0 deletions b/‎1.introduction/commands.txt‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎11-naive-bayes/naivebayes1.R‎
Lines changed: 87 additions & 0 deletions b/‎11-naive-bayes/naivebayes1.R‎
Lines changed: 87 additions & 0 deletions
diff --git a/‎11-naive-bayes/naivebayes2.R‎
Lines changed: 91 additions & 0 deletions b/‎11-naive-bayes/naivebayes2.R‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎12-decisiontree/decision-trees.R‎
Lines changed: 82 additions & 0 deletions b/‎12-decisiontree/decision-trees.R‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎13-linear-regression/linear-regression1.R‎
Lines changed: 21 additions & 0 deletions b/‎13-linear-regression/linear-regression1.R‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎13-linear-regression/linear-regression2.R‎
Lines changed: 24 additions & 0 deletions b/‎13-linear-regression/linear-regression2.R‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎13-linear-regression/linear-regression3.R‎
Lines changed: 32 additions & 0 deletions b/‎13-linear-regression/linear-regression3.R‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎logistic-regression/logistic-regression.R‎ renamed to ‎14-logistic-regression/logistic-regression.R‎ b/‎logistic-regression/logistic-regression.R‎ renamed to ‎14-logistic-regression/logistic-regression.R‎
@@ -0,0 +1,56 @@
+help related commands
+--------------------
+help.start()
+help(name) / ?name
+example(name)
+
+workspace commands
+------------------
+ls()
+ls(pattern="")
+rm(object name)
+data()
+source("script-file")
+getwd(), setwd()
+save.image()
+
+history commands
+---------------
+history()
+history(max.show=Inf)
+savehistory(file="")
+loadhistory(file="")
+
+packages commands
+---------------------
+install.packages("packagename")
+install.packages("packagename",repos="")
+library(packagename)
+library() -- shows all packages installed on the machine (they may or may not be loaded into the R workspace)
+search() -- shows all the packages loaded into the R workspace and also shows the search path order
+
+
+common functions
+---------------------
+length(object) # number of elements or components
+str(object)    # structure of an object
+class(object)  # class or type of an object
+names(object)  # names
+
+head(mydata, n=10)/head(mydata, 10)
+
+tail(mydata, n=5)/tail(mydata, 5)
+colnames(mydata)
+rownames(mydata)
+summary(mydata)
+
+summary functions
+----------------
+mean(x)/mean(x, na.rm=TRUE)
+median(x)
+table(x)
+var(x)
+sd(x)
+max(x)-min(x)
+IQR(x)
+quantile(x)
@@ -0,0 +1,87 @@
+# to simplify selections
+library(dplyr)
+# for stemming the words
+library(SnowballC)
+# libraries required by caret
+library(klaR)
+library(e1071)
+# for the Naive Bayes modelling
+library(caret)
+# to process the text into a corpus
+library(tm)
+
+# Fix the random number generator state so repeated runs give the same results
+set.seed(1234)
+
+# Load the tab-separated SMS collection directly from the zip archive
+setwd("E:/data analytics/datasets")
+sms_raw <- read.table(
+  file = unz("smsspamcollection.zip", "SMSSpamCollection"),
+  header = FALSE,
+  sep = "\t",
+  quote = "",
+  stringsAsFactors = FALSE
+)
+# Take a first look at what was read
+dim(sms_raw)
+str(sms_raw)
+head(sms_raw)
+
+# Name the two columns and turn the label column into a factor
+names(sms_raw) <- c("type", "text")
+sms_raw[["type"]] <- factor(sms_raw[["type"]])
+
+# Build a tm corpus with one document per SMS message
+sms_corpus <- Corpus(VectorSource(sms_raw$text))
+
+as.character(sms_corpus[[1]])
+inspect(sms_corpus[seq_len(10)])
+
+# To avoid the issue with the DocumentTermMatrix method, use one of:
+# 1) wrap non-standard transformations in content_transformer(), which avoids
+#    the type conversion issue
+# 2) add tm_map(PlainTextDocument) after all the cleaning is done
+
+# Clean the corpus one transformation at a time: lower-case, drop digits and
+# punctuation, remove English stop words, collapse whitespace, then stem
+sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
+sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)
+sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation)
+sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords(kind = "en"))
+sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace)
+sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
+
+inspect(sms_corpus_clean[seq_len(10)])
+
+# Build the document-term matrix. The minimum term length is set through
+# `wordLengths` (lower and upper bound); `minWordLength` is not a recognised
+# control option in this tm API, so it would leave the default minimum of 3.
+sms_corpus_clean <- DocumentTermMatrix(
+  sms_corpus_clean,
+  control = list(wordLengths = c(2, Inf))
+)
+dim(sms_corpus_clean)
+# Drop terms whose sparsity exceeds 0.98
+sms_corpus_clean <- removeSparseTerms(sms_corpus_clean, 0.98)
+dim(sms_corpus_clean)
+inspect(sms_corpus_clean[1:10, 1:10])
+
+# Turn term frequencies into a presence flag: any positive count becomes
+# "Yes", zero becomes "No". The factor is returned invisibly.
+convert_counts <- function(x) {
+  flags <- ifelse(x > 0, 1, 0)
+  invisible(factor(flags, levels = c(0, 1), labels = c("No", "Yes")))
+}
+# Run the count-to-flag conversion over every column (term) of the matrix
+sms_corpus_clean_binary <- apply(sms_corpus_clean, MARGIN = 2, FUN = convert_counts)
+dim(sms_corpus_clean_binary)
+sms_corpus_clean_binary[seq_len(10), seq_len(10)]
+
+
+# Train the model
+# Resampling scheme: 10-fold cross-validation
+ctrl <- trainControl(method = "cv", number = 10)
+
+# Fit the Naive Bayes classifier once; every inspection below reuses this fit
+# instead of re-running the whole cross-validation a second time.
+sms_model <- train(sms_corpus_clean_binary, sms_raw$type,
+                   method = "nb", trControl = ctrl)
+sms_model
+
+# Look inside the fitted train object
+str(sms_model)
+sms_model$trainingData
+sms_model$resample
+sms_model$time
+
+# Test the model
+# NOTE: predictions are made on the same matrix the model was trained on, so
+# the figures below are resubstitution results, not hold-out results.
+
+# Class probabilities, for inspection only
+sms_predict <- predict(sms_model, sms_corpus_clean_binary, type = "prob")
+str(sms_predict)
+head(sms_predict)
+
+# confusionMatrix() compares predicted class labels with the reference labels,
+# so it needs the "raw" (class) predictions rather than the probability table.
+sms_predict_class <- predict(sms_model, sms_corpus_clean_binary, type = "raw")
+cm <- confusionMatrix(sms_predict_class, sms_raw$type, positive = "spam")
+cm
+
+
+
@@ -0,0 +1,91 @@
+# to simplify selections
+library(dplyr)
+# for stemming the words
+library(SnowballC)
+# libraries required by caret
+library(klaR)
+library(e1071)
+# for the Naive Bayes modelling
+library(caret)
+# to process the text into a corpus
+library(tm)
+# to get nice looking tables
+library(pander)
+
+# Fix the random number generator state so repeated runs give the same results
+set.seed(1234)
+
+# Percentage frequency table of `x`, rounded to one decimal place.
+# `caption` is accepted but not used by the body.
+frqtab <- function(x, caption) {
+  counts <- table(x)
+  shares <- prop.table(counts)
+  round(100 * shares, 1)
+}
+
+# Load the tab-separated SMS collection directly from the zip archive
+setwd("E:/data analytics/datasets")
+sms_raw <- read.table(
+  file = unz("smsspamcollection.zip", "SMSSpamCollection"),
+  header = FALSE,
+  sep = "\t",
+  quote = "",
+  stringsAsFactors = FALSE
+)
+# Shuffle the rows into a random order
+sms_raw <- sms_raw[sample(nrow(sms_raw)), ]
+
+# Take a first look at what was read
+dim(sms_raw)
+str(sms_raw)
+head(sms_raw)
+
+# Name the two columns and turn the label column into a factor
+names(sms_raw) <- c("type", "text")
+sms_raw[["type"]] <- factor(sms_raw[["type"]])
+
+# Build a tm corpus with one document per SMS message
+sms_corpus <- Corpus(VectorSource(sms_raw$text))
+
+inspect(sms_corpus[seq_len(10)])
+
+# To avoid the issue with the DocumentTermMatrix method, use one of:
+# 1) wrap non-standard transformations in content_transformer(), which avoids
+#    the type conversion issue
+# 2) add tm_map(PlainTextDocument) after all the cleaning is done
+
+# Clean the corpus one transformation at a time: lower-case, drop digits and
+# punctuation, remove English stop words, collapse whitespace, then stem
+sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
+sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)
+sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation)
+sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords(kind = "en"))
+sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace)
+sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
+
+inspect(sms_corpus_clean[seq_len(10)])
+
+# Build the document-term matrix. The minimum term length is set through
+# `wordLengths` (lower and upper bound); `minWordLength` is not a recognised
+# control option in this tm API, so it would leave the default minimum of 3.
+sms_corpus_clean <- DocumentTermMatrix(
+  sms_corpus_clean,
+  control = list(wordLengths = c(2, Inf))
+)
+dim(sms_corpus_clean)
+inspect(sms_corpus_clean[1:10, 1:10])
+
+# Turn term frequencies into a presence flag: any positive count becomes
+# "Yes", zero becomes "No". The factor is returned invisibly.
+convert_counts <- function(x) {
+  flags <- ifelse(x > 0, 1, 0)
+  invisible(factor(flags, levels = c(0, 1), labels = c("No", "Yes")))
+}
+# Run the count-to-flag conversion over every column (term) of the matrix
+sms_corpus_clean_binary <- apply(sms_corpus_clean, MARGIN = 2, FUN = convert_counts)
+dim(sms_corpus_clean_binary)
+sms_corpus_clean_binary[seq_len(10), seq_len(10)]
+
+
+# Train the model
+# Resampling scheme shared by both fits below: 10-fold cross-validation
+ctrl <- trainControl(method = "cv", number = 10)
+
+# First fit: Naive Bayes with caret's default tuning grid
+sms_model <- train(sms_corpus_clean_binary, sms_raw$type,
+                   method = "nb", trControl = ctrl)
+sms_model
+
+
+# Second fit: explicit tuning grid with fL of 0 and 1, kernel estimation off
+grid <- data.frame(.fL = c(0, 1), .usekernel = FALSE)
+
+sms_model <- train(sms_corpus_clean_binary, sms_raw$type,
+                   method = "nb", tuneGrid = grid, trControl = ctrl)
+sms_model
+
+# Test the model
+# NOTE: predictions are made on the same matrix the model was trained on, so
+# the figures below are resubstitution results, not hold-out results.
+
+# Class probabilities, for inspection only
+sms_predict <- predict(sms_model, sms_corpus_clean_binary, type = "prob")
+str(sms_predict)
+head(sms_predict)
+
+# confusionMatrix() compares predicted class labels with the reference labels,
+# so it needs the "raw" (class) predictions rather than the probability table.
+sms_predict_class <- predict(sms_model, sms_corpus_clean_binary, type = "raw")
+cm <- confusionMatrix(sms_predict_class, sms_raw$type, positive = "spam")
+cm
+
+
+
@@ -0,0 +1,82 @@
+library(caret)
+library(randomForest)
+library(ggplot2)
+library(Amelia)
+
+setwd("E:/data analytics/kaggle/titanic/data")
+
+# Read the training data; both "NA" and the empty string count as missing
+trainSet <- read.csv("train.csv", header = TRUE, na.strings = c("NA", ""))
+dim(trainSet)
+str(trainSet)
+head(trainSet)
+# Treat the outcome and the passenger class as categorical
+factor_cols <- c("Survived", "Pclass")
+trainSet[factor_cols] <- lapply(trainSet[factor_cols], factor)
+summary(trainSet)
+
+# Map of missing values across the training columns
+missmap(trainSet,
+        main = "Titanic Training Data - Missings Map",
+        col = c("yellow", "black"),
+        legend = FALSE)
+
+# Distribution of the outcome
+table(trainSet$Survived)
+ggplot(data = trainSet, mapping = aes(x = Survived)) +
+  geom_bar()
+
+# Survived vs passenger class: cross-tabulation and stacked bar chart
+summary(trainSet$Pclass)
+xtabs(~ Survived + Pclass, data = trainSet)
+ggplot(data = trainSet, mapping = aes(x = Survived, fill = Pclass)) +
+  geom_bar()
+
+# Survived vs Sex: cross-tabulation and stacked bar chart
+summary(trainSet$Sex)
+xtabs(~ Survived + Sex, data = trainSet)
+ggplot(data = trainSet, mapping = aes(x = Survived, fill = Sex)) +
+  geom_bar()
+
+
+# Survived vs Embarked: cross-tabulation and stacked bar chart
+summary(trainSet$Embarked)
+xtabs(~ Survived + Embarked, data = trainSet)
+ggplot(data = trainSet, mapping = aes(x = Survived, fill = Embarked)) +
+  geom_bar()
+
+# Survived vs Age: the Age boxplots for survivors and for those who died
+# are very similar.
+xtabs(~ Survived + Age, data = trainSet)
+ggplot(data = trainSet, mapping = aes(x = Survived, y = Age)) +
+  geom_boxplot()
+# Age also has lots of NA's. Exclude this variable
+summary(trainSet$Age)
+
+# Survived vs Fare: the Fare boxplots for survivors and for those who died
+# differ considerably.
+ggplot(data = trainSet, mapping = aes(x = Survived, y = Fare)) +
+  geom_boxplot()
+# Fare has no NA's. Include this variable.
+summary(trainSet$Fare)
+
+# Survived vs Parch
+ggplot(data = trainSet, mapping = aes(x = Survived, y = Parch)) +
+  geom_boxplot()
+summary(trainSet$Parch)
+
+# Set a random seed so the cross-validation folds are reproducible
+set.seed(42)
+
+# Model tuning strategy: 10-fold cross-validation
+ctrl <- trainControl(method = "cv", number = 10)
+
+# Train a single decision tree (method = "rpart"). Despite its name, model_rf
+# is not a random forest.
+# Age contains NAs and train()'s formula interface defaults to
+# na.action = na.fail, which aborts the fit. na.pass hands the incomplete rows
+# through to rpart, which can handle missing predictor values itself.
+model_rf <- train(Survived ~ Pclass + Sex + Age + Embarked + SibSp + Fare,
+                  data = trainSet,
+                  method = "rpart",
+                  na.action = na.pass,
+                  trControl = ctrl)
+model_rf
+
+# Load the test set and encode Pclass the same way as in the training data
+testSet <- read.table("test.csv", sep = ",", header = TRUE)
+dim(testSet)
+str(testSet)
+head(testSet)
+testSet$Pclass <- factor(testSet$Pclass)
+summary(testSet)
+# Replace missing fares with the mean of the observed fares
+testSet$Fare[is.na(testSet$Fare)] <- mean(testSet$Fare, na.rm = TRUE)
+
+
+# Predict with the model trained in this script (model_rf; `model_logit` is
+# never defined here). na.action = na.pass keeps rows that still have missing
+# predictors, so exactly one prediction comes back per test row and the
+# column assignment lines up.
+testSet$Survived <- predict(model_rf, newdata = testSet, na.action = na.pass)
+
+# Keep only the two columns written to the submission file
+submission <- testSet[, c("PassengerId", "Survived")]
+
+write.table(submission, file = "submission.csv", col.names = TRUE, row.names = FALSE, sep = ",")
@@ -0,0 +1,21 @@
+library(ggplot2)
+library(dplyr)
+library(klaR)
+library(e1071)
+library(caret)
+library(Lock5Data)
+data(RestaurantTips)
+
+# First look at the data set
+dim(RestaurantTips)
+str(RestaurantTips)
+head(RestaurantTips)
+
+# Resampling scheme: 10-fold cross-validation
+ctrl <- trainControl(method = "cv", number = 10)
+
+# Linear regression of Tip on every other column
+reg_model <- train(
+  Tip ~ .,
+  data = RestaurantTips,
+  method = "lm",
+  trControl = ctrl
+)
+reg_model
+reg_model[["finalModel"]]
+reg_model[["finalModel"]][["residuals"]]
+
+# Fitted values for the rows the model was trained on
+predicted <- predict(reg_model, newdata = RestaurantTips)
+str(predicted)
@@ -0,0 +1,24 @@
+library(ggplot2)
+library(dplyr)
+library(klaR)
+library(e1071)
+library(caret)
+library(Lock5Data)
+data(RestaurantTips)
+
+# First look at the data set
+dim(RestaurantTips)
+str(RestaurantTips)
+head(RestaurantTips)
+
+# attr1 is an exact linear combination of two existing predictors
+RestaurantTips$attr1 <- RestaurantTips$Bill + RestaurantTips$Guests
+
+# attr2 is the same combination plus a little Gaussian noise. The noise vector
+# is sized from the data instead of a hard-coded row count, so it always
+# matches the number of rows.
+RestaurantTips$attr2 <- RestaurantTips$Bill + RestaurantTips$Guests +
+  rnorm(nrow(RestaurantTips), 0, 1) * 0.05
+
+# Resampling scheme: 10-fold cross-validation
+ctrl <- trainControl(method = "cv", number = 10)
+
+# Linear regression that includes the nearly collinear attr2
+reg_model <- train(Tip ~ Bill + Guests + attr2, data = RestaurantTips,
+                   method = "lm", trControl = ctrl)
+reg_model
+reg_model$finalModel
+
+# Fitted values for the rows the model was trained on
+predicted <- predict(reg_model, RestaurantTips)
+str(predicted)
@@ -0,0 +1,32 @@
+library(ggplot2)
+library(dplyr)
+library(klaR)
+library(e1071)
+library(caret)
+library(Lock5Data)
+data(RestaurantTips)
+
+# First look at the data set
+dim(RestaurantTips)
+str(RestaurantTips)
+head(RestaurantTips)
+
+# attr1 is an exact linear combination of two existing predictors
+RestaurantTips$attr1 <- RestaurantTips$Bill + RestaurantTips$Guests
+
+# attr2 is the same combination plus a little Gaussian noise. The noise vector
+# is sized from the data instead of a hard-coded row count, so it always
+# matches the number of rows.
+RestaurantTips$attr2 <- RestaurantTips$Bill + RestaurantTips$Guests +
+  rnorm(nrow(RestaurantTips), 0, 1) * 0.05
+
+# Resampling scheme shared by all three fits: 10-fold cross-validation
+ctrl <- trainControl(method = "cv", number = 10)
+
+# Same formula (including the collinear attr1) fitted with three methods:
+# ordinary least squares, ridge and lasso
+reg_model1 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips,
+                    method = "lm", trControl = ctrl)
+
+reg_model2 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips,
+                    method = "ridge", trControl = ctrl)
+
+reg_model3 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips,
+                    method = "lasso", trControl = ctrl)
+
+reg_model1
+reg_model1$finalModel
+
+reg_model2
+reg_model2$finalModel
+
+reg_model3
+reg_model3$finalModel