|
| 1 | +library(caret) |
| 2 | +library(randomForest) |
| 3 | +library(ggplot2) |
| 4 | +library(outliers) |
| 5 | + |
| 6 | +setwd("E:/data analytics/kaggle/springleaf") |
| 7 | +# NOTE(review): hard-coded working directory; the script only runs where this path exists. |
| 8 | +# Read the training file; "NA", "", "-1" and "-99999" are all parsed as missing values. |
| 9 | +trainSet <- read.csv("train.csv", header = TRUE, na.strings = c("NA", "", "-1", "-99999")) |
| 10 | +dim(trainSet) |
| 11 | +str(trainSet) |
| 12 | +head(trainSet) |
| 13 | +# Work on a reproducible random subset of at most 20000 rows (all rows if the file is smaller). |
| 14 | +set.seed(42) |
| 15 | +trainSet <- trainSet[sample(nrow(trainSet), min(20000, nrow(trainSet))), ] |
| 16 | +dim(trainSet) |
| 17 | + |
| 18 | +# Remove the variables whose NA count exceeds 95% of the rows |
| 19 | +threshold_val <- 0.95 * nrow(trainSet) |
| 20 | +include_cols <- colSums(is.na(trainSet)) <= threshold_val  # per-column NA counts, no matrix coercion of the data |
| 21 | +trainSet <- trainSet[, include_cols, drop = FALSE]  # drop = FALSE keeps a data frame even if one column survives |
| 22 | + |
| 23 | +# Remove the variables that caret flags as having near-zero variance |
| 24 | +nearZvar <- nearZeroVar(trainSet, saveMetrics = TRUE)  # one metrics row per column, including the logical `nzv` flag |
| 25 | +trainSet <- trainSet[, !nearZvar$nzv, drop = FALSE] |
| 26 | + |
| 27 | +# Remove highly correlated variables (absolute correlation above 0.95); the last column is left out of the check. |
| 28 | +# cor() needs numeric input, and pairwise-complete observations keep NAs from wiping out whole rows of the matrix. |
| 29 | +predictors <- trainSet[, -ncol(trainSet), drop = FALSE] |
| 30 | +numeric_preds <- predictors[, vapply(predictors, is.numeric, logical(1)), drop = FALSE] |
| 31 | +corr_matrix <- abs(cor(numeric_preds, use = "pairwise.complete.obs")) |
| 32 | +corr_matrix[is.na(corr_matrix)] <- 0  # an undefined correlation is treated as "not correlated" |
| 33 | +diag(corr_matrix) <- 0 |
| 34 | +correlated_col <- findCorrelation(corr_matrix, verbose = FALSE, cutoff = .95) |
| 35 | +# Drop by name: a negative index built from an empty result would select zero columns instead of all of them. |
| 36 | +trainSet <- trainSet[, setdiff(names(trainSet), colnames(corr_matrix)[correlated_col]), drop = FALSE] |
| 37 | +# Set a random seed so the cross-validation fold assignment is reproducible |
| 38 | +set.seed(42) |
| 39 | + |
| 40 | +#model tuning strategy |
| 41 | +ctrl = trainControl(method = "cv", # Use cross-validation |
| 42 | +                    number = 10) # Use 10 folds for cross-validation |
| 43 | +# NOTE(review): the formula expects Survived, Pclass, Sex, Embarked and Fare in trainSet; confirm they exist after the column filters above. |
| 44 | +model_dt = train(Survived ~ Pclass + Sex + Embarked + Fare, |
| 45 | +                 data = trainSet, |
| 46 | +                 method = "rpart", |
| 47 | +                 trControl = ctrl) |
| 48 | +model_dt$finalModel |
| 49 | +model_dt |
| 50 | +# Second fit: same rpart formula, with tuneLength = 10 passed to train(). |
| 51 | +model_dt <- train( |
| 52 | +  Survived ~ Pclass + Sex + Embarked + Fare, |
| 53 | +  method = "rpart", data = trainSet, |
| 54 | +  tuneLength = 10, trControl = ctrl |
| 55 | +) |
| 56 | +model_dt$finalModel  # print the final fitted model |
| 57 | +model_dt             # print the train object itself |
| 58 | +# Third fit: supply an explicit tuning grid with three candidate cp values. |
| 59 | +grid <- data.frame(.cp = c(0, 0.1, 0.6)) |
| 60 | +model_dt <- train( |
| 61 | +  Survived ~ Pclass + Sex + Embarked + Fare, |
| 62 | +  method = "rpart", data = trainSet, |
| 63 | +  tuneGrid = grid, trControl = ctrl |
| 64 | +) |
| 65 | +model_dt$finalModel  # print the final fitted model |
| 66 | +model_dt             # print the train object itself |
| 67 | +# Fourth fit: formula extended with Age and SibSp; a control object with minsplit = 2 is passed through train(). |
| 68 | +model_dt = train(Survived ~ Pclass + Sex + Age + Embarked + SibSp + Fare, |
| 69 | +                 data = trainSet, |
| 70 | +                 method = "rpart", |
| 71 | +                 trControl = ctrl, control = rpart::rpart.control(minsplit = 2))  # qualified: this script has no library(rpart) |
| 72 | +model_dt$finalModel |
| 73 | +model_dt |
| 74 | +# Fifth fit: same extended formula with method "J48"; model_dt is overwritten again, so this is the fit used afterwards. |
| 75 | +model_dt <- train( |
| 76 | +  Survived ~ Pclass + Sex + Age + Embarked + SibSp + Fare, |
| 77 | +  method = "J48", trControl = ctrl, data = trainSet |
| 78 | +) |
| 79 | +model_dt$finalModel  # print the final fitted model |
| 80 | + |
| 81 | +# Score the test file: load it, inspect it, tidy two columns, predict, and export the result. |
| 82 | +testSet <- read.table(file = "test.csv", header = TRUE, sep = ",") |
| 83 | +dim(testSet) |
| 84 | +str(testSet) |
| 85 | +head(testSet) |
| 86 | +testSet$Pclass <- factor(testSet$Pclass) |
| 87 | +summary(testSet) |
| 88 | +testSet$Fare <- with(testSet, ifelse(is.na(Fare), mean(Fare, na.rm = TRUE), Fare)) |
| 89 | +# The line above replaces missing Fare values with the mean of the non-missing ones. |
| 90 | +# Predictions come from whichever model_dt was fitted last. |
| 91 | +testSet$Survived <- predict(model_dt, newdata = testSet) |
| 92 | +# Keep only the id column and the predicted label. |
| 93 | +submission <- testSet[, c("PassengerId", "Survived")] |
| 94 | +# Comma-separated output with a header row and no row names. |
| 95 | +write.table(submission, sep = ",", row.names = FALSE, col.names = TRUE, file = "submission.csv") |
| 96 | +# Session inspection: list available packages, show the search path, list the objects in package:base. |
| 97 | +library() |
| 98 | +search() |
| 99 | +ls("package:base") |
0 commit comments