getDocumentTermMatrix <- function(text.vector, document.names = NULL) {
  # Function to convert a vector of characters into a document-term matrix
  # ARGS
  #   text.vector    - A vector of type character. Each element should be a
  #                    document. Documents are tokenised by splitting on single
  #                    blank spaces; NA documents are treated as empty.
  #   document.names - A vector containing the names to be assigned to the
  #                    documents. If not provided, default names ("Doc1",
  #                    "Doc2", ...) are used.
  #
  # RETURN
  #   The function returns a list with two elements:
  #     doc.term.matrix - integer matrix with one row per document and one
  #                       column per word, holding the number of occurrences
  #                       of each word in each document
  #     words           - character vector with the distinct words, in order
  #                       of first appearance (same order as the columns)
  #
  # ERRORS
  #   Stops if text.vector is not a character vector or if document.names is
  #   provided with a length different from that of text.vector.

  if (!is.character(text.vector)) {
    stop("text.vector has to be a character vector")
  }
  # Validate the names before doing any work
  if (!is.null(document.names) &&
      length(document.names) != length(text.vector)) {
    stop("The length of the two vectors provided has to be the same")
  }

  # First step, split the documents using blank space
  tokens <- strsplit(text.vector, " ", fixed = TRUE)

  # Determine the words in the documents, removing empty strings (produced by
  # consecutive spaces) and NAs (produced by NA documents). as.character keeps
  # the result a character vector even when there are no tokens at all.
  words <- unique(as.character(unlist(tokens, use.names = FALSE)))
  words <- words[!is.na(words) & nzchar(words)]

  # Preallocate the matrix so that the result has the right shape even when
  # there are no documents or no words
  doc.term <- matrix(0L, nrow = length(tokens), ncol = length(words),
                     dimnames = list(NULL, words))

  # Count the occurrences of each word in each document. match() maps every
  # token to its column (NA for tokens that are not words, which tabulate()
  # ignores), so each document is processed in a single pass.
  if (length(words) > 0L) {
    for (i in seq_along(tokens)) {
      doc.term[i, ] <- tabulate(match(tokens[[i]], words),
                                nbins = length(words))
    }
  }

  # Set the names of the documents
  if (is.null(document.names)) {
    rownames(doc.term) <- paste0("Doc", seq_len(nrow(doc.term)))
  } else {
    rownames(doc.term) <- document.names
  }

  list(doc.term.matrix = doc.term, words = words)
}

# Example of use with some tweets. The example is skipped when the data file is
# not available, so that sourcing this file to get the function does not fail.
tweets.file <- "tweets_CEC2017.csv"
if (file.exists(tweets.file)) {
  tweets <- read.csv(tweets.file, stringsAsFactors = FALSE)
  res <- getDocumentTermMatrix(tweets[, 1])
  print(head(res$doc.term.matrix))
  print(res$words)
} else {
  message("Example skipped: file '", tweets.file, "' not found")
}