文本清理
# Text-cleaning pipeline: builds a corpus from `files` and normalises it
# step by step, reassigning `docs` after every transformation.
# NOTE(review): assumes the tm package is attached and `files` is defined
# earlier in the script -- neither is visible in this section.
docs <- Corpus(VectorSource(files))

# Print one document for a quick look at the raw text.
# NOTE(review): index 30 is hard-coded; this errors if the corpus holds
# fewer than 30 documents.
writeLines(as.character(docs[[30]]))

# Lower-case everything so later word matching is case-insensitive.
docs <- tm_map(docs, content_transformer(tolower))

# Helper transformer: replace every match of `pattern` with a single space.
# `pattern` is passed to gsub() as a regular expression.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Turn hyphens, curly quotes and bullets into spaces before punctuation is
# removed, so the words on either side of them are not glued together.
docs <- tm_map(docs, toSpace, "-")
docs <- tm_map(docs, toSpace, "’")
docs <- tm_map(docs, toSpace, "‘")
docs <- tm_map(docs, toSpace, "•")
docs <- tm_map(docs, toSpace, "”")
docs <- tm_map(docs, toSpace, "“")

# Drop remaining punctuation and digits, then collapse runs of whitespace.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)

# Remove the standard English stop words, then a custom list on top.
docs <- tm_map(docs, removeWords, stopwords("english"))
myStopwords <- c("can", "it", "may", "might", "great", "kind")
docs <- tm_map(docs, removeWords, myStopwords)

# Reduce the remaining words to their stems.
docs <- tm_map(docs, stemDocument)
Last updated