---
title: "Classification"
output:
  pdf_document:
    fig_height: 3
---

# Data Set -- Iris

```{r}
# load data
data(iris)

# this is what it looks like...
head(iris)

# ...and this is what it looks like plotted
library(ggplot2)
ggplot(iris, aes(Petal.Length, Petal.Width)) +
  geom_point(aes(color = Species, shape = Species))
```

# Logistic Regression

```{r}
library(mlr)

# Fix the RNG seed so that the random train/test split below and the
# resampling runs in the later chunks are the same on every knit.
set.seed(123)

# Create task and learner
# Logistic regression can only handle two classes, so subset data accordingly.
# Rows 51:150 contain no setosa observations; droplevels() removes that now
# empty factor level so the target really is a two-level factor.
iris2 = droplevels(iris[51:150, ])
task = makeClassifTask(data = iris2, target = "Species")
learner = makeLearner("classif.logreg")

# split the data into train and test set
# (2/3 of the rows, rounded down, for training; the remaining rows for testing)
n = nrow(iris2)
train.set = sample(n, size = floor(2 / 3 * n))
test.set = setdiff(seq_len(n), train.set)

# train a model
model = train(learner, task, subset = train.set)
model

# now predict on the test set
predictions = predict(model, task = task, subset = test.set)
predictions

# How did we do?
performance(predictions, measures = acc)
calculateConfusionMatrix(predictions)

# What does the learned model look like?
getLearnerModel(model)

# let's plot the predictions
plotLearnerPrediction(learner, task, measures = acc,
  features = c("Petal.Length", "Petal.Width"))
```

## Predicting Probabilities

```{r}
# same learner as before, but ask for class probabilities instead of labels
learner = makeLearner("classif.logreg", predict.type = "prob")
model = train(learner, task, subset = train.set)
predictions = predict(model, task = task, subset = test.set)
predictions

# plot how performance changes if we move the threshold for the classes
d = generateThreshVsPerfData(predictions, measures = acc)
plotThreshVsPerf(d)
```

## Using Resampling

```{r}
# mlr can do the partitioning into train and test set automatically
# (rdesc is reused by all resample() calls in the following sections)
rdesc = makeResampleDesc(method = "Holdout", split = 2/3)
# models = TRUE keeps the fitted model so it can be inspected below
result = resample(learner, task, rdesc, measures = acc, models = TRUE)

# get predictions
getRRPredictions(result)

# get model
getLearnerModel(result$models[[1]])
```

# Linear Discriminant Analysis

```{r}
# from here on the task uses the full iris data with all three species
task = makeClassifTask(data = iris, target = "Species")
learner = makeLearner("classif.lda")
result = resample(learner, task, rdesc, measures = acc, models = TRUE)
plotLearnerPrediction(learner, task, measures = acc,
  features = c("Petal.Length", "Petal.Width"))
getLearnerModel(result$models[[1]])
```

# Support Vector Machines

```{r}
# SVM with the learner's default settings
learner = makeLearner("classif.ksvm")
result = resample(learner, task, rdesc, measures = acc, models = TRUE)
getLearnerModel(result$models[[1]])
plotLearnerPrediction(learner, task, measures = acc,
  features = c("Petal.Length", "Petal.Width"))
```

## Different Kernel

```{r}
# same SVM learner, but with the kernel explicitly set to "vanilladot"
learner = makeLearner("classif.ksvm", par.vals = list(kernel = "vanilladot"))
result = resample(learner, task, rdesc, measures = acc, models = TRUE)
getLearnerModel(result$models[[1]])
plotLearnerPrediction(learner, task, measures = acc,
  features = c("Petal.Length", "Petal.Width"))
```

# Classification Trees

```{r}
learner = makeLearner("classif.rpart")
result = resample(learner, task, rdesc, measures = acc, models = TRUE)
plotLearnerPrediction(learner, task, measures = acc,
  features = c("Petal.Length", "Petal.Width"))
getLearnerModel(result$models[[1]])

# this is not part of mlr
library(rpart.plot)
rpart.plot(getLearnerModel(result$models[[1]]))
```

# Random Forests

```{r}
learner = makeLearner("classif.randomForest")
result = resample(learner, task, rdesc, measures = acc, models = TRUE)
getLearnerModel(result$models[[1]])
plotLearnerPrediction(learner, task, measures = acc,
  features = c("Petal.Length", "Petal.Width"))
```

# More

```{r}
# learners and performance measures that are applicable to this task
listLearners(task)$class
listMeasures(task)
```