Skip to content

Commit 931e63f

Browse files
committed
Merge pull request #1 from algorithmica-repository/master
Update from original
2 parents 51c9125 + ae3aefb commit 931e63f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+1017846
-261
lines changed

1.introduction/commands.txt

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
help related commands
2+
--------------------
3+
help.start()
4+
help(name) / ?name
5+
example(name)
6+
7+
workspace commands
8+
------------------
9+
ls()
10+
ls(pattern="")
11+
rm(object name)
12+
data()
13+
source("script-file")
14+
getwd(), setwd()
15+
save.image()
16+
17+
history commands
18+
---------------
19+
history()
20+
history(max.show=Inf)
21+
savehistory(file="")
22+
loadhistory(file="")
23+
24+
packages commands
25+
---------------------
26+
install.packages("packagename")
27+
install.packages("packagename",repos="")
28+
library(packagename)
29+
library() -- shows all packages installed on the machine (may or may not be loaded into the R workspace)
30+
search() -- show all the packages loaded into R workspace and also shows the search path order
31+
32+
33+
common functions
34+
---------------------
35+
length(object) # number of elements or components
36+
str(object)   # structure of an object
37+
class(object) # class or type of an object
38+
names(object) # names
39+
40+
head(mydata, n=10)/head(mydata, 10)
41+
42+
tail(mydata, n=5)/tail(mydata, 5)
43+
colnames(mydata)
44+
rownames(mydata)
45+
summary(mydata)
46+
47+
summary functions
48+
----------------
49+
mean(x)/mean(x, na.rm=TRUE)
50+
median(x)
51+
table(x)
52+
var(x)
53+
sd(x)
54+
max(x)-min(x)
55+
IQR(x)
56+
quantile(x)

11-naive-bayes/naivebayes1.R

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
# Naive Bayes SMS spam classifier.
# Reads the SMS Spam Collection, cleans the messages into a document-term
# matrix, binarizes term counts, then trains and evaluates a Naive Bayes
# model with 10-fold cross-validation.

# to simplify selections
library(dplyr)
# for stemming the words
library(SnowballC)
# libraries required by caret
library(klaR)
library(e1071)
# for the Naive Bayes modelling
library(caret)
# to process the text into a corpus
library(tm)

# Set seed for reproducibility of cross-validation folds
set.seed(1234)

# Read the data: tab-separated, no header (V1 = ham/spam label, V2 = text)
setwd("E:/data analytics/datasets")
sms_raw <- read.table(unz("smsspamcollection.zip", "SMSSpamCollection"),
                      header = FALSE, sep = "\t", quote = "",
                      stringsAsFactors = FALSE)

# Explore the dataset
dim(sms_raw)
str(sms_raw)
head(sms_raw)

colnames(sms_raw) <- c("type", "text")
sms_raw$type <- factor(sms_raw$type)

# Preparing the dataset
sms_corpus <- Corpus(VectorSource(sms_raw$text))

as.character(sms_corpus[[1]])
inspect(sms_corpus[1:10])

# To avoid the issue with the DocumentTermMatrix method, use one of:
# 1) wrap non-standard transformations in content_transformer(), or
# 2) apply tm_map(PlainTextDocument) after all the cleaning is done.
sms_corpus_clean <- sms_corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords(kind = "en")) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)

inspect(sms_corpus_clean[1:10])

sms_dtm <- DocumentTermMatrix(sms_corpus_clean,
                              control = list(minWordLength = 2))
dim(sms_dtm)
# Drop terms absent from more than 98% of documents to keep the matrix small
sms_dtm <- removeSparseTerms(sms_dtm, 0.98)
dim(sms_dtm)
inspect(sms_dtm[1:10, 1:10])

# Convert the dtm into boolean (Yes/No) values instead of term frequencies:
# categorical Naive Bayes works on presence/absence rather than counts.
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  factor(x, levels = c(0, 1), labels = c("No", "Yes"))
}
sms_dtm_binary <- sms_dtm %>% apply(MARGIN = 2, FUN = convert_counts)
dim(sms_dtm_binary)
sms_dtm_binary[1:10, 1:10]

# Train the model with 10-fold cross-validation.
# (The original script trained the identical model twice; once is enough.)
ctrl <- trainControl(method = "cv", 10)
sms_model <- train(sms_dtm_binary, sms_raw$type, method = "nb",
                   trControl = ctrl)
sms_model
str(sms_model)
sms_model$trainingData
sms_model$resample
sms_model$time

# Test the model.
# Class probabilities are useful for inspection...
sms_predict_prob <- predict(sms_model, sms_dtm_binary, type = "prob")
str(sms_predict_prob)
head(sms_predict_prob)

# BUG FIX: confusionMatrix() requires hard class predictions (a factor),
# not the probability data frame returned by type = "prob". Predict class
# labels (the default type) for the evaluation.
sms_predict <- predict(sms_model, sms_dtm_binary)
cm <- confusionMatrix(sms_predict, sms_raw$type, positive = "spam")
cm

11-naive-bayes/naivebayes2.R

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
# Naive Bayes SMS spam classifier, variant with an explicit tuning grid.
# Same pipeline as naivebayes1.R but shuffles the rows, keeps the full
# document-term matrix (no sparsity pruning), and retrains with a manual
# Laplace-smoothing grid.

# to simplify selections
library(dplyr)
# for stemming the words
library(SnowballC)
# libraries required by caret
library(klaR)
library(e1071)
# for the Naive Bayes modelling
library(caret)
# to process the text into a corpus
library(tm)
# to get nice looking tables
library(pander)

# Set seed for reproducibility of shuffling and cross-validation folds
set.seed(1234)

# Percentage frequency table of x, optionally rendered via pander.
# BUG FIX: the original accepted `caption` but never used it.
frqtab <- function(x, caption = NULL) {
  tab <- round(100 * prop.table(table(x)), 1)
  if (!is.null(caption)) {
    pander(tab, caption = caption)
  }
  tab
}

# Read the data: tab-separated, no header (V1 = ham/spam label, V2 = text)
setwd("E:/data analytics/datasets")
sms_raw <- read.table(unz("smsspamcollection.zip", "SMSSpamCollection"),
                      header = FALSE, sep = "\t", quote = "",
                      stringsAsFactors = FALSE)
# Shuffle the rows
sms_raw <- sms_raw[sample(nrow(sms_raw)), ]

# Explore the dataset
dim(sms_raw)
str(sms_raw)
head(sms_raw)

colnames(sms_raw) <- c("type", "text")
sms_raw$type <- factor(sms_raw$type)

# Preparing the dataset
sms_corpus <- Corpus(VectorSource(sms_raw$text))

inspect(sms_corpus[1:10])

# To avoid the issue with the DocumentTermMatrix method, use one of:
# 1) wrap non-standard transformations in content_transformer(), or
# 2) apply tm_map(PlainTextDocument) after all the cleaning is done.
sms_corpus_clean <- sms_corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords(kind = "en")) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)

inspect(sms_corpus_clean[1:10])

sms_dtm <- DocumentTermMatrix(sms_corpus_clean,
                              control = list(minWordLength = 2))
dim(sms_dtm)
inspect(sms_dtm[1:10, 1:10])

# Convert the dtm into boolean (Yes/No) values instead of term frequencies
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  factor(x, levels = c(0, 1), labels = c("No", "Yes"))
}
sms_dtm_binary <- sms_dtm %>% apply(MARGIN = 2, FUN = convert_counts)
dim(sms_dtm_binary)
sms_dtm_binary[1:10, 1:10]

# Train a baseline model with 10-fold cross-validation
ctrl <- trainControl(method = "cv", 10)
sms_model <- train(sms_dtm_binary, sms_raw$type, method = "nb",
                   trControl = ctrl)
sms_model

# Retrain with an explicit tuning grid: Laplace smoothing fL in {0, 1},
# no kernel density estimation.
grid <- data.frame(.fL = c(0, 1), .usekernel = FALSE)
sms_model <- train(sms_dtm_binary, sms_raw$type, method = "nb",
                   tuneGrid = grid, trControl = ctrl)
sms_model

# Test the model.
# Class probabilities are useful for inspection...
sms_predict_prob <- predict(sms_model, sms_dtm_binary, type = "prob")
str(sms_predict_prob)
head(sms_predict_prob)

# BUG FIX: confusionMatrix() requires hard class predictions (a factor),
# not the probability data frame returned by type = "prob".
sms_predict <- predict(sms_model, sms_dtm_binary)
cm <- confusionMatrix(sms_predict, sms_raw$type, positive = "spam")
cm

12-decisiontree/decision-trees.R

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
# Titanic survival: exploratory analysis plus a decision-tree model
# (rpart via caret), producing a Kaggle-style submission file.

library(caret)
library(randomForest)
library(ggplot2)
library(Amelia)

setwd("E:/data analytics/kaggle/titanic/data")

# Training data: blank strings are treated as missing values
trainSet <- read.csv("train.csv", header = TRUE, na.strings = c("NA", ""))
dim(trainSet)
str(trainSet)
head(trainSet)
trainSet$Survived <- factor(trainSet$Survived)
trainSet$Pclass <- factor(trainSet$Pclass)
summary(trainSet)

# Visual overview of missing values
missmap(trainSet, main = "Titanic Training Data - Missings Map",
        col = c("yellow", "black"), legend = FALSE)

table(trainSet$Survived)
ggplot(trainSet, aes(x = Survived)) + geom_bar()

# Comparing Survived and passenger class using table and histograms
summary(trainSet$Pclass)
xtabs(~Survived + Pclass, data = trainSet)
ggplot(trainSet, aes(x = Survived, fill = Pclass)) + geom_bar()

# Comparing Survived and Sex using table and histograms
summary(trainSet$Sex)
xtabs(~Survived + Sex, data = trainSet)
ggplot(trainSet, aes(x = Survived, fill = Sex)) + geom_bar()

# Comparing Survived and Embarked using table and histograms
summary(trainSet$Embarked)
xtabs(~Survived + Embarked, data = trainSet)
ggplot(trainSet, aes(x = Survived, fill = Embarked)) + geom_bar()

# Comparing Age and Survived: the boxplots are very similar between Age
# for survivors and those who died.
xtabs(~Survived + Age, data = trainSet)
ggplot(trainSet, aes(x = Survived, y = Age)) + geom_boxplot()
# Also, there are lots of NA's in Age
summary(trainSet$Age)

# Comparing Survived and Fare: the boxplots are much different between
# fare for survivors and those who died.
ggplot(trainSet, aes(x = Survived, y = Fare)) + geom_boxplot()
# Also, there are no NA's. Include this variable.
summary(trainSet$Fare)

# Comparing Survived and Parch
ggplot(trainSet, aes(x = Survived, y = Parch)) + geom_boxplot()
summary(trainSet$Parch)

# Set a random seed
set.seed(42)

# Model tuning strategy: 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)

# Train the model. NOTE(review): the original comment said "random
# forest", but method = "rpart" fits a single decision tree.
model_rpart <- train(Survived ~ Pclass + Sex + Age + Embarked + SibSp + Fare,
                     data = trainSet,
                     method = "rpart",
                     trControl = ctrl)
model_rpart

# Test data: read with the same NA handling as the training data
# (the original used read.table without na.strings, inconsistently).
testSet <- read.csv("test.csv", header = TRUE, na.strings = c("NA", ""))
dim(testSet)
str(testSet)
head(testSet)
testSet$Pclass <- factor(testSet$Pclass)
summary(testSet)
# Impute missing Fare values with the mean fare
testSet$Fare <- ifelse(is.na(testSet$Fare),
                       mean(testSet$Fare, na.rm = TRUE),
                       testSet$Fare)

# BUG FIX: the original predicted with an undefined `model_logit`;
# use the decision-tree model trained above.
testSet$Survived <- predict(model_rpart, newdata = testSet)

submission <- testSet[, c("PassengerId", "Survived")]

write.table(submission, file = "submission.csv", col.names = TRUE,
            row.names = FALSE, sep = ",")
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
# Linear regression on the RestaurantTips data set (Lock5Data),
# evaluated with 10-fold cross-validation via caret.

library(ggplot2)
library(dplyr)
library(klaR)
library(e1071)
library(caret)
library(Lock5Data)

data(RestaurantTips)

# Quick look at the data
dim(RestaurantTips)
str(RestaurantTips)
head(RestaurantTips)

# 10-fold cross-validation
cv_ctrl <- trainControl(method = "cv", 10)

# Model Tip from all other columns with ordinary least squares
reg_model <- train(Tip ~ ., data = RestaurantTips, method = "lm",
                   trControl = cv_ctrl)
reg_model
reg_model$finalModel
reg_model$finalModel$residuals

# In-sample predictions
predicted <- predict(reg_model, RestaurantTips)
str(predicted)
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
# Linear regression on RestaurantTips with a derived, nearly-collinear
# predictor, to demonstrate the effect of redundant features.

library(ggplot2)
library(dplyr)
library(klaR)
library(e1071)
library(caret)
library(Lock5Data)

data(RestaurantTips)

dim(RestaurantTips)
str(RestaurantTips)
head(RestaurantTips)

# attr1 is exactly collinear with Bill + Guests
RestaurantTips$attr1 <- RestaurantTips$Bill + RestaurantTips$Guests

# attr2 is Bill + Guests plus small Gaussian noise, so it is almost --
# but not exactly -- collinear with the other predictors.
# FIX: derive the sample size from the data instead of hard-coding 157.
RestaurantTips$attr2 <- RestaurantTips$Bill + RestaurantTips$Guests +
  rnorm(nrow(RestaurantTips), 0, 1) * 0.05

# 10-fold cross-validation
ctrl <- trainControl(method = "cv", 10)

# OLS including the noisy near-duplicate predictor attr2
reg_model <- train(Tip ~ Bill + Guests + attr2, data = RestaurantTips,
                   method = "lm", trControl = ctrl)
reg_model
reg_model$finalModel

# In-sample predictions
predicted <- predict(reg_model, RestaurantTips)
str(predicted)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
# Compare OLS, ridge, and lasso regression on RestaurantTips when an
# exactly-collinear derived predictor (attr1) is included in the model.

library(ggplot2)
library(dplyr)
library(klaR)
library(e1071)
library(caret)
library(Lock5Data)

data(RestaurantTips)

dim(RestaurantTips)
str(RestaurantTips)
head(RestaurantTips)

# attr1 is exactly collinear with Bill + Guests
RestaurantTips$attr1 <- RestaurantTips$Bill + RestaurantTips$Guests

# attr2 adds small Gaussian noise, making it only nearly collinear.
# FIX: derive the sample size from the data instead of hard-coding 157.
# NOTE(review): attr2 is created but not used by the models below.
RestaurantTips$attr2 <- RestaurantTips$Bill + RestaurantTips$Guests +
  rnorm(nrow(RestaurantTips), 0, 1) * 0.05

# 10-fold cross-validation
ctrl <- trainControl(method = "cv", 10)

# Same formula (with the collinear attr1) fitted three ways
reg_model1 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips,
                    method = "lm", trControl = ctrl)

reg_model2 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips,
                    method = "ridge", trControl = ctrl)

reg_model3 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips,
                    method = "lasso", trControl = ctrl)

reg_model1
reg_model1$finalModel

reg_model2
reg_model2$finalModel

reg_model3
reg_model3$finalModel

0 commit comments

Comments
 (0)