# Exploratory look at the training data: products per session, session
# durations, gender/product cross-tabs and timestamp-parsing experiments.
# All string literals use plain ASCII quotes; typographic quotes are not
# valid R syntax.

# Install a package only when it is missing instead of on every run.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)

data <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/train.csv")
head(data)
str(data)

# Products per session = number of ";" separators in ProductList plus one.
data$product <- str_count(data$ProductList, ";") + 1
head(data)

# Session durations. The timestamps are parsed explicitly so the arithmetic
# runs on date-times rather than on the raw text columns.
# NOTE(review): the day/month/two-digit-year layout is inferred from the
# sample literal used for date1 below -- confirm against train.csv.
time_format <- "%d/%m/%y %H:%M"
session_start <- as.POSIXct(
  as.character(data$startTime), format = time_format, tz = "UTC"
)
session_end <- as.POSIXct(
  as.character(data$endTime), format = time_format, tz = "UTC"
)
data$hours <- difftime(session_end, session_start, units = "hours")
data$min <- difftime(session_end, session_start, units = "mins")
# Plain numeric duration in minutes.
data$x <- as.double(data$min, units = "mins")

table(data$product)
hist(data$product)
table(data$gender)
count <- table(data$gender, data$product)
barplot(count)
str(data)
# NOTE(review): no ET column is created anywhere in the visible script, so
# this is expected to print NULL -- confirm whether it can be dropped.
head(data$ET)
head(data$endTime)

# Timestamp-parsing experiments. The format string has to match the literal,
# otherwise the result is NA and the lubridate accessors below return NA.
date1 <- as.POSIXlt("16/12/14 14:41", format = "%d/%m/%y %H:%M")
date2 <- as.POSIXlt("2015-10-05T22:43:00.000", format = "%Y-%m-%dT%H:%M:%S")

if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
library(lubridate)
year(date1)
month(date1)
day(date1)
hour(date1)

# First two characters of the start timestamp.
data$date <- substr(data$startTime, 1, 2)
head(data)
# Merge the train and test data sets and derive product-code features.
test <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/test.csv")
# Insert a placeholder gender column as the fifth column of the test data.
df_test <- as.data.frame(append(test, list(gender = 0), after = 4))
head(df_test)

# Numeric gender flag (1 = "male", 0 = anything else), the same encoding as
# data_x$G below. The original call was ifelse() with no arguments, which
# stops with a missing-argument error.
data$gender_num <- ifelse(data$gender == "male", 1, 0)

data_x <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/train.csv")
data_x$G <- ifelse(data_x$gender == "male", 1, 0)
head(data_x)
data_x <- subset(data_x, select = -c(gender))

data_test <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/test.csv")
# Placeholder G = 0 for the test rows so both frames have the same columns.
df_data_test <- as.data.frame(append(data_test, list(G = 0), after = 4))
head(df_data_test)

df_Janta <- rbind(data_x, df_data_test)
# Work on plain character strings for all substring operations below.
df_Janta$ProductList <- as.character(df_Janta$ProductList)
df_Janta$Product <- str_count(df_Janta$ProductList, ";") + 1
head(df_Janta)

# In the training data set we have 8192 females and 2308 males
# (counts as recorded by the original author).
table(data_x$G)
# In total there have been 7934 single-product purchases
# (count as recorded by the original author).
table(df_Janta$Product)

# substr() takes (start, stop), not (start, length). The original calls
# substr(x, 21, 6) had start > stop and therefore always returned "".
# NOTE(review): interpreted here as "6 characters starting at position 21";
# confirm the offsets against the actual ProductList layout.
df_Janta$first <- substr(df_Janta$ProductList, 21, 26)
str(df_Janta)
df_Janta$x <- df_Janta$first
df_Janta$x
str(df_Janta)

# First six characters of each product list, with "Null" for empty lists.
# Vectorised replacement for the per-element sapply()/if construct.
leading_code <- substr(df_Janta$ProductList, 1, 6)
first <- ifelse(leading_code != "", leading_code, "Null")
table(first)
# NOTE(review): test rows carry the placeholder G = 0, so they are counted
# in the 0 column of this cross-tab.
table(first, df_Janta$G)

# Same (start, stop) correction as above: the original substr(x, 7, 6)
# always returned "".
# NOTE(review): read as "6 characters starting at position 7" -- confirm.
second <- substr(df_Janta$ProductList, 7, 12)
table(second)
# Build the combined modelling frame, split it back into train/test and fit
# a logistic regression for gender.
train_f <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/train.csv")
head(train_f)
test_f <- read.csv("C:/Users/User/Desktop/Hackathon/JantaHack/test.csv")
head(test_f)
str(train_f)

# The test rows get a missing gender (inserted as the fifth column). A
# literal 0 placeholder would become an extra factor level; sorted first, it
# would make glm() treat every training row as a "success".
test_f <- as.data.frame(append(test_f, list(gender = NA), after = 4))
str(test_f)
both <- rbind(train_f, test_f)

# Adding number of products
both$no_prod <- str_count(both$ProductList, ";") + 1
str(both)

# Convert the response and the categorical predictors to factors on the
# combined frame so train and test share the same level sets.
# NOTE(review): gb_p, gb_p2, gb_1, sum and sum_gb are not created anywhere
# in the visible script; presumably they come from the CSV files -- confirm.
factor_cols <- c("gender", "gb_p", "gb_p2", "gb_1", "sum", "sum_gb")
missing_cols <- setdiff(factor_cols, names(both))
if (length(missing_cols) > 0) {
  stop(
    "Columns missing from the input data: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
both[factor_cols] <- lapply(both[factor_cols], as.factor)
str(both)

# Split by the actual number of training rows instead of hard-coded indices.
is_train <- seq_len(nrow(both)) <= nrow(train_f)
traindata <- both[is_train, ]
testdata <- both[!is_train, ]

model_log <- glm(
  gender ~ gb_p + gb_p2 + gb_1 + sum + sum_gb + no_prod,
  data = traindata,
  family = binomial
)
summary(model_log)

# type = "response" returns probabilities (of the second level of gender);
# the default would return values on the log-odds scale.
x <- predict(model_log, testdata, type = "response")

# A data frame keeps session_id and the prediction in their own types;
# cbind() would coerce both into a single matrix type.
sub <- data.frame(session_id = testdata$session_id, x = x)
write.csv(
  sub,
  "C:/Users/User/Desktop/Hackathon/JantaHack/submit_lm.csv",
  row.names = FALSE
)
# Support vector machine: grid-search gamma/cost, fit a C-classification
# model and write the predicted gender labels for the test rows.

# Install packages only when they are missing instead of on every run.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(caret)
library(e1071)

# Fixed seed so the tuning run is reproducible.
set.seed(101)
tuned <- tune.svm(
  gender ~ gb_p + gb_p2 + gb_1 + sum + sum_gb + no_prod,
  data = traindata,
  gamma = seq(0.1, 0.5, 0.1),
  cost = seq(1, 60, 10)
)
tuned$best.parameters

# NOTE(review): gamma and cost are hard-coded here rather than taken from
# tuned$best.parameters -- presumably copied from a tuning run; confirm.
model_svm <- svm(
  gender ~ gb_p + gb_p2 + gb_1 + sum + sum_gb + no_prod,
  data = traindata,
  gamma = 0.1,
  cost = 1,
  type = "C-classification"
)
summary(model_svm)

# predict() for svm objects has no `type` argument, so it is not passed.
svm_pred <- predict(model_svm, testdata)

# Store the class labels as text: cbind() would replace the factor
# predictions with their internal integer codes.
fin_svm <- data.frame(
  session_id = testdata$session_id,
  svm_pred = as.character(svm_pred)
)
write.csv(
  fin_svm,
  "C:/Users/User/Desktop/Hackathon/JantaHack/submit_svm.csv",
  row.names = FALSE
)
# Linear model of gender on the number of products.
# lm() needs a numeric response; gender is a factor at this point, so it is
# encoded as a 0/1 indicator for "male" inside the formula.
model_lin <- lm(as.numeric(gender == "male") ~ no_prod, data = traindata)
summary(model_lin)

# Fitted values for the test rows (on the 0/1 indicator scale).
lm_pred <- predict(model_lin, testdata)
head(testdata)

# A data frame keeps session_id and the numeric prediction in their own
# types; cbind() would coerce both into a single matrix type.
pred_lm <- data.frame(session_id = testdata$session_id, lm_pred = lm_pred)
head(pred_lm)
table(lm_pred)
write.csv(
  pred_lm,
  "C:/Users/User/Desktop/Hackathon/JantaHack/submit_lin.csv",
  row.names = FALSE
)
# Random forest classifier for gender; writes the predicted labels for the
# test rows.

# Install the package only when it is missing instead of on every run.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# randomForest() stops when the response has a level with no observations.
# traindata$gender may carry such a level (the placeholder used for the
# test rows), so unused levels of the response are dropped on a copy.
# Predictor levels are left untouched so they still match testdata.
rf_train <- traindata
rf_train$gender <- droplevels(rf_train$gender)

model_rf <- randomForest(
  gender ~ gb_p + gb_p2 + gb_1 + sum + sum_gb + no_prod,
  data = rf_train
)
model_rf
pred_rf <- predict(model_rf, testdata)

# Store the class labels as text: cbind() would replace the factor
# predictions with their internal integer codes.
sub_rf <- data.frame(
  session_id = testdata$session_id,
  pred_rf = as.character(pred_rf)
)
write.csv(
  sub_rf,
  "C:/Users/User/Desktop/Hackathon/JantaHack/submit_rf.csv",
  row.names = FALSE
)
head(train_f)
# Keep the product list as a plain character column. as.String() is not a
# base R function and is not provided by any package loaded in the visible
# script; as.character() gives one string per row.
train_f$Str <- as.character(train_f$ProductList)
library(xgboost)