# Janta Hack – Analytics Vidhya R code

# Install stringr only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)

# Load the training sessions and take a first look at them.
data <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/train.csv")
head(data)
str(data)

# Number of products per session: entries in ProductList are separated by
# ";", so the count is the number of separators plus one.
data$product <- str_count(data$ProductList, ";") + 1
head(data)

# Parse the timestamps explicitly before doing arithmetic on them. The raw
# columns are text, so `-` cannot subtract them and difftime() would have to
# guess the layout.
# NOTE(review): the "%d/%m/%y %H:%M" layout is assumed from the sample value
# "16/12/14 14:41" used elsewhere in this script -- confirm against the CSV.
time_format <- "%d/%m/%y %H:%M"
start_time <- as.POSIXct(
  as.character(data$startTime), format = time_format, tz = "UTC"
)
end_time <- as.POSIXct(
  as.character(data$endTime), format = time_format, tz = "UTC"
)

# Session duration as difftime objects (hours, minutes) and as a plain
# number of minutes.
data$hours <- difftime(end_time, start_time, units = "hours")
data$min <- difftime(end_time, start_time, units = "mins")
data$x <- as.double(end_time - start_time, units = "mins")

# Distribution of the number of products per session.
table(data$product)
hist(data$product)

# Gender counts, then gender by number of products as a bar chart.
table(data$gender)
count <- table(data$gender, data$product)
barplot(count)
str(data)
head(data$ET)
head(data$endTime)

# Parse two sample timestamps, each with a format string that matches its
# layout. The previous format for date1 ("%Y-%m%dT%H:%M:%S") did not match
# the slash-separated literal, so the parse produced NA.
# NOTE(review): day/month/two-digit-year order is assumed for date1 --
# confirm against the data.
date1 <- as.POSIXlt("16/12/14 14:41", format = "%d/%m/%y %H:%M")
date2 <- as.POSIXlt("2015-10-05T22:43:00.000", format = "%Y-%m-%dT%H:%M:%S")

# Install lubridate only when it is missing.
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
library(lubridate)

# Pull individual components out of the parsed timestamp.
year(date1)
month(date1)
day(date1)
hour(date1)

# Keep the first two characters of startTime as a separate column.
data$date <- substr(data$startTime, 1, 2)
head(data)

# Merge Train and Test Data Set
test <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/test.csv")
# The test file has no gender column; insert a placeholder (0) after the
# fourth column so the frame has the same columns as the training data.
df_test <- as.data.frame(append(test, list(gender = 0), after = 4))
head(df_test)

# Numeric gender flag (1 = male, 0 = otherwise). The original call was
# ifelse() with no arguments, which stops with a missing-argument error; the
# encoding used here is the same one applied to data_x$G below.
data$gender_num <- ifelse(data$gender == "male", 1, 0)

# Reload the training data and replace the text gender column with the
# numeric flag G.
data_x <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/train.csv")
data_x$G <- ifelse(data_x$gender == "male", 1, 0)
head(data_x)
data_x <- subset(data_x, select = -gender)

# Reload the test data and give it a placeholder G column so it can be
# stacked under the training rows.
data_test <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/test.csv")
df_data_test <- as.data.frame(append(data_test, list(G = 0), after = 4))
head(df_data_test)

# Stack train and test, then count the products in every session
# (number of ";" separators plus one).
df_Janta <- rbind(data_x, df_data_test)
df_Janta$Product <- str_count(df_Janta$ProductList, ";") + 1
head(df_Janta)

# In the training dataset we have 8192 females and 2308 males
table(data_x$G)

# In total there have been 7934 single-product purchases
table(df_Janta$Product)

# substr() takes (start, stop), not (start, length). The original call
# substr(x, 21, 6) has start > stop and therefore returned "" for every row.
# NOTE(review): a 6-character window starting at position 21 is assumed to be
# what was intended -- confirm the offsets against the ProductList layout.
df_Janta$first <- substr(df_Janta$ProductList, 21, 26)
str(df_Janta)
df_Janta$ProductList <- as.character(df_Janta$ProductList)
df_Janta$x <- substr(df_Janta$ProductList, 21, 26)
df_Janta$x
str(df_Janta)

# First six characters of every ProductList entry, with "Null" standing in
# for entries that are empty. Computed on the whole column at once instead of
# calling a function per row; missing entries also map to "Null" rather than
# aborting the scalar `if` with an NA condition.
# Unlike the sapply() version, the result carries no element names; the
# table() calls below do not use them.
product_prefix <- substr(as.character(df_Janta$ProductList), 1, 6)
has_prefix <- !is.na(product_prefix) & product_prefix != ""
first <- ifelse(has_prefix, product_prefix, "Null")

# Frequency of each prefix, overall and split by the gender flag G.
table(first)
table(first, df_Janta$G)

# Second six-character slice of every ProductList entry.
# substr() takes (start, stop); the original substr(x, 7, 6) has start > stop
# and returned "" for every row, so table(second) had a single empty bucket.
# NOTE(review): characters 7 to 12 are assumed to be the intended window --
# confirm against the ProductList layout (a separator may sit at position 7).
second <- substr(as.character(df_Janta$ProductList), 7, 12)
table(second)

# Load the final train and test files used for modelling.
train_f <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/train.csv")
head(train_f)
test_f <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/test.csv")
head(test_f)

str(train_f)
# Give the test rows a placeholder gender column (0) after the fourth column
# so both frames have the same columns.
test_f <- as.data.frame(append(test_f, list(gender = 0), after = 4))
str(test_f)

# Stack train on top of test so feature columns are built once for both.
both <- rbind(train_f, test_f)

# Adding number of products
both$no_prod <- str_count(both$ProductList, ";") + 1
str(both)

# Convert the response and the categorical predictors to factors in one step.
# NOTE(review): gb_p, gb_p2, gb_1, sum and sum_gb are not created anywhere in
# this script, so they are presumably already present in the CSV files --
# verify. The check below fails early with a clear message if they are not.
factor_cols <- c("gender", "gb_p", "gb_p2", "gb_1", "sum", "sum_gb")
missing_cols <- setdiff(factor_cols, names(both))
if (length(missing_cols) > 0) {
  stop(
    "Columns missing from the combined data: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
both[factor_cols] <- lapply(both[factor_cols], as.factor)
str(both)

# Split the stacked frame back into train and test. The boundary comes from
# the number of training rows instead of hard-coded row numbers, so the split
# stays correct if the input files change size.
n_train <- nrow(train_f)
traindata <- both[seq_len(n_train), ]
testdata <- both[n_train + seq_len(nrow(both) - n_train), ]

# The placeholder gender value given to the test rows survives as an unused
# factor level in the training rows. For a binomial glm the first level is
# treated as "failure" and every other level as "success", so an empty first
# level would put all training rows in the same class. Drop it.
traindata$gender <- droplevels(traindata$gender)

# Logistic regression of gender on the categorical features and basket size.
model_log <- glm(
  gender ~ gb_p + gb_p2 + gb_1 + sum + sum_gb + no_prod,
  data = traindata,
  family = binomial
)
summary(model_log)

# type = "response" returns probabilities; the default is the linear
# predictor (log-odds).
x <- predict(model_log, testdata, type = "response")

# Build the submission as a data frame: cbind() would turn the ids and the
# predictions into a single-typed matrix and lose the column names.
sub <- data.frame(session_id = testdata$session_id, x = x)

write.csv(sub, "C:/Users/User/Desktop/Hackathon/JantaHack/submit_lm.csv")

# Install the modelling packages only when they are missing.
for (pkg in c("caret", "e1071")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(caret)
library(e1071)
set.seed(101)

# Grid search over gamma and cost for the SVM.
tuned <- tune.svm(
  gender ~ gb_p + gb_p2 + gb_1 + sum + sum_gb + no_prod,
  data = traindata,
  gamma = seq(0.1, 0.5, 0.1),
  cost = seq(1, 60, 10)
)
tuned$best.parameters

# NOTE(review): gamma and cost are fixed here rather than read from
# tuned$best.parameters -- confirm they match the tuning result.
model_svm <- svm(
  gender ~ gb_p + gb_p2 + gb_1 + sum + sum_gb + no_prod,
  data = traindata,
  gamma = 0.1,
  cost = 1,
  type = "C-classification"
)

summary(model_svm)

# predict() for svm objects has no `type` argument, so the former
# type = "response" had no effect and is dropped.
svm_pred <- predict(model_svm, testdata)

# Build the submission as a data frame: cbind() replaces factor predictions
# with their internal integer codes, which loses the class labels.
fin_svm <- data.frame(session_id = testdata$session_id, svm_pred = svm_pred)
write.csv(fin_svm, "C:/Users/User/Desktop/Hackathon/JantaHack/submit_svm.csv")

# Linear model of gender on the number of products. lm() needs a numeric
# response and gender is a factor, so it is encoded explicitly as
# 1 = male, 0 = otherwise (the same encoding used for the G flag elsewhere in
# this script). Predictions are therefore on a 0-1 scale.
model_lin <- lm(as.integer(gender == "male") ~ no_prod, data = traindata)
summary(model_lin)
lm_pred <- predict(model_lin, testdata)
head(testdata)

# Data frame rather than cbind(), so the ids keep their own type and the
# predictions stay numeric instead of being coerced into one matrix type.
pred_lm <- data.frame(session_id = testdata$session_id, lm_pred = lm_pred)
head(pred_lm)
table(lm_pred)
write.csv(pred_lm, "C:/Users/User/Desktop/Hackathon/JantaHack/submit_lin.csv")

# Install randomForest only when it is missing.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Random forest classifier on the same predictors as the other models.
model_rf <- randomForest(
  gender ~ gb_p + gb_p2 + gb_1 + sum + sum_gb + no_prod,
  data = traindata
)
model_rf

pred_rf <- predict(model_rf, testdata)

# Build the submission as a data frame: cbind() replaces factor predictions
# with their internal integer codes, which loses the class labels.
sub_rf <- data.frame(session_id = testdata$session_id, pred_rf = pred_rf)

write.csv(sub_rf, "C:/Users/User/Desktop/Hackathon/JantaHack/submit_rf.csv")

head(train_f)

# Keep a plain character copy of ProductList. as.String() is not a base R
# function and no package loaded in this script provides it; as.character()
# converts the column element by element.
train_f$Str <- as.character(train_f$ProductList)

library(xgboost)

# Author: TheDataMonk
#
# I am the Co-Founder of The Data Monk. I have a total of 6+ years of analytics experience: 3+ years at Mu Sigma, 2 years at OYO, and 1 year and counting at The Data Monk. I am an active trader and a logically sarcastic idiot :)