Skip to content

Commit ae3aefb

Browse files
author
algorithmica-repository
committed
uploading the template for preprocessing
1 parent 157f2cb commit ae3aefb

File tree

1 file changed

+99
-0
lines changed

1 file changed

+99
-0
lines changed
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
library(caret)
2+
library(randomForest)
3+
library(ggplot2)
4+
library(outliers)
5+
6+
# NOTE(review): the working directory is machine-specific; adjust it to the
# folder that holds train.csv and test.csv.
setwd("E:/data analytics/kaggle/springleaf")

# Read the dataset from the train file. Besides "NA", empty strings, "-1" and
# "-99999" are read as missing values.
trainSet <- read.csv(
  "train.csv",
  header = TRUE,
  na.strings = c("NA", "", "-1", "-99999")
)
dim(trainSet)
str(trainSet)
head(trainSet)

# Work on a reproducible random subsample of at most 20000 rows. Capping the
# size at nrow(trainSet) keeps sample() from failing on a smaller file.
set.seed(42)
sample_size <- min(20000, nrow(trainSet))
trainSet <- trainSet[sample(nrow(trainSet), sample_size), ]
dim(trainSet)
17+
18+
# Remove the variables in which more than 95% of the values are NA.
threshold_val <- 0.95 * nrow(trainSet)
# Count NAs per column directly; apply() would first coerce the whole data
# frame to a matrix.
na_counts <- colSums(is.na(trainSet))
include_cols <- na_counts <= threshold_val
# drop = FALSE keeps a data frame even if only one column survives.
trainSet <- trainSet[, include_cols, drop = FALSE]
22+
23+
# Remove the variables that caret flags as having near-zero variance.
# saveMetrics = TRUE returns one row of diagnostics per column, including the
# logical `nzv` flag used below.
nearZvar <- nearZeroVar(trainSet, saveMetrics = TRUE)
# drop = FALSE keeps a data frame even if only one column survives.
trainSet <- trainSet[, !nearZvar$nzv, drop = FALSE]
26+
27+
# Remove the variables which are highly correlated (absolute correlation
# above 0.95). The last column of trainSet is excluded from the screen.
predictors <- trainSet[, -ncol(trainSet), drop = FALSE]
# cor() only accepts numeric input, so restrict the screen to numeric columns.
numeric_cols <- names(predictors)[vapply(predictors, is.numeric, logical(1))]
correlated_col <- integer(0)
if (length(numeric_cols) >= 2) {
  # Pairwise-complete observations keep missing values from turning whole
  # rows/columns of the correlation matrix into NA.
  corr_matrix <- abs(cor(predictors[, numeric_cols, drop = FALSE],
                         use = "pairwise.complete.obs"))
  # An undefined correlation is treated as "not correlated"; findCorrelation()
  # rejects matrices that contain missing values.
  corr_matrix[is.na(corr_matrix)] <- 0
  diag(corr_matrix) <- 0
  flagged <- findCorrelation(corr_matrix, verbose = FALSE, cutoff = .95)
  # Translate positions in the numeric subset back to positions in trainSet.
  correlated_col <- match(numeric_cols[flagged], names(trainSet))
}
# Guard the empty case: trainSet[, -integer(0)] would drop every column.
if (length(correlated_col) > 0) {
  trainSet <- trainSet[, -correlated_col, drop = FALSE]
}
32+
33+
34+
35+
36+
37+
# Set a random seed
38+
39+
40+
# Model tuning strategy: resample with 10-fold cross-validation.
ctrl <- trainControl(
  method = "cv", # cross-validation
  number = 10    # number of folds
)
43+
44+
# Baseline decision tree (rpart) resampled with `ctrl`, using caret's default
# tuning settings.
# NOTE(review): the formula refers to Survived, Pclass, Sex, Embarked and
# Fare -- confirm these columns exist in trainSet at this point.
model_dt <- train(
  Survived ~ Pclass + Sex + Embarked + Fare,
  data = trainSet,
  method = "rpart",
  trControl = ctrl
)
# Inspect the tree refit on the full training data, then the resampling summary.
model_dt$finalModel
model_dt
50+
51+
# Same rpart model, but let caret evaluate 10 candidate values of the tuning
# parameter instead of its default number.
model_dt <- train(
  Survived ~ Pclass + Sex + Embarked + Fare,
  data = trainSet,
  method = "rpart",
  trControl = ctrl,
  tuneLength = 10
)
# Inspect the final tree, then the resampling summary.
model_dt$finalModel
model_dt
58+
59+
# Same rpart model, tuned over an explicit grid of complexity-parameter (cp)
# values rather than a grid chosen by caret.
grid <- data.frame(.cp = c(0, 0.1, 0.6))
model_dt <- train(
  Survived ~ Pclass + Sex + Embarked + Fare,
  data = trainSet,
  method = "rpart",
  trControl = ctrl,
  tuneGrid = grid
)
# Inspect the final tree, then the resampling summary.
model_dt$finalModel
model_dt
67+
68+
# rpart model on a wider set of predictors, allowing splits on nodes with as
# few as 2 observations. `control` is forwarded by train() to rpart().
# rpart.control is namespace-qualified because this script never attaches
# rpart, so the bare name is not guaranteed to be on the search path.
model_dt <- train(
  Survived ~ Pclass + Sex + Age + Embarked + SibSp + Fare,
  data = trainSet,
  control = rpart::rpart.control(minsplit = 2),
  method = "rpart",
  trControl = ctrl
)
# Inspect the final tree, then the resampling summary.
model_dt$finalModel
model_dt
74+
75+
# Fit the same predictors with caret's "J48" method instead of rpart,
# resampled with `ctrl`. This is the model used for scoring further down.
model_dt <- train(
  Survived ~ Pclass + Sex + Age + Embarked + SibSp + Fare,
  data = trainSet,
  method = "J48",
  trControl = ctrl
)
# Inspect the fitted model.
model_dt$finalModel
80+
81+
82+
# Load the test file and take a first look at it.
testSet <- read.table("test.csv", sep = ",", header = TRUE)
dim(testSet)
str(testSet)
head(testSet)

# Treat passenger class as categorical.
# NOTE(review): no matching conversion of trainSet$Pclass is visible in this
# script -- confirm both sets use the same type before predicting.
testSet[["Pclass"]] <- factor(testSet[["Pclass"]])
summary(testSet)

# Fill missing fares with the mean of the observed fares.
fare_mean <- mean(testSet[["Fare"]], na.rm = TRUE)
testSet[["Fare"]] <- ifelse(is.na(testSet[["Fare"]]), fare_mean, testSet[["Fare"]])

# Score the test rows with the most recently fitted model.
testSet[["Survived"]] <- predict(model_dt, newdata = testSet)

# Keep only the id and the prediction, and write them out as a CSV file.
submission <- testSet[, c("PassengerId", "Survived")]
write.table(
  submission,
  file = "submission.csv",
  col.names = TRUE,
  row.names = FALSE,
  sep = ","
)
96+
97+
# Interactive helpers for exploring the session.
library()                   # list the installed packages
search()                    # show the current search path
ls(name = "package:base")   # list the objects exported by base

0 commit comments

Comments
 (0)