# Environment: R 3.2.0, RStudio
# Load required packages.
# (In the original paste, the comment and the library() calls shared one line,
# so the leading '#' commented out every load — split onto separate lines.)
library(dplyr)
library(tm)
library(wordcloud)
library(Rwordseg)
# Read the review document via an interactive file picker.
# Use the named, spelled-out header argument instead of positional `T`
# (T is a reassignable variable, not a reserved word).
wb <- read.csv(file.choose(), header = TRUE)
# Insert the custom aviation dictionary into the Rwordseg segmenter.
# NOTE(review): `words_p` is not defined anywhere in this file — presumably
# loaded in an earlier step that was omitted; confirm before running.
insertWords(strwords = words_p)
# --- Cleaning and word segmentation ---
wb$评论 <- as.character(wb$评论)

# Filler words that interfere with segmentation.
pattern <- c('还是', '很', '也', '了', '可以', '还', '是', '都', '相当', '大家',
             '确实', '非常', '应该', '蛮', '整体', '里面', '就', '实在', '总体',
             '听说', '比较', '都是', '够', '还算', '极其', '也算', '太', '算是')

# BUG FIX: gsub() only uses the first element of a vector `pattern` (with a
# warning), so the original removed only '还是'. The original also built an
# unused `pattern2` as a character class '[还是,很,...]', which would have
# matched single characters (including the comma), not whole words. Collapse
# the words into one alternation regex instead.
wb$评论 <- gsub(pattern = paste(pattern, collapse = '|'),
                replacement = '', x = wb$评论)
rm(pattern)

# Strip English letters and digits from the review text.
wb$评论 <- gsub('[a-zA-Z0-9]', '', wb$评论)

# Keep only reviews with at least 4 characters.
sentence <- as.vector(wb$评论)
train.test <- wb[nchar(sentence) >= 4, ]
# Derive row names from the actual row count instead of the hard-coded 23581,
# which silently breaks on any other input file.
row.names(train.test) <- seq_len(nrow(train.test))
rm(wb, sentence)

# Segment the surviving reviews into word tokens (Rwordseg).
segword <- segmentCN(strwords = train.test$评论)
# --- Stop-word removal ---
# Read the stop-word list (interactive file picker) and coerce the first
# column to a plain character vector.
mystopwords <- read.table(file.choose(), stringsAsFactors = FALSE)
mystopwords <- as.vector(mystopwords[, 1])

# Drop every token that appears in the stop-word list.
# @param target_words character vector of tokens for one review
# @param stop_words   character vector of words to discard
# @return the tokens not present in `stop_words`
removewords <- function(target_words, stop_words) {
  target_words[!target_words %in% stop_words]
}

# Use lapply, not sapply: sapply's return type depends on whether the
# per-review token vectors happen to share a length, while lapply always
# preserves the one-list-element-per-review structure.
segword2 <- lapply(X = segword, FUN = removewords, mystopwords)
# --- Word cloud ---
# Save the current graphics parameters, draw the cloud on a black background,
# then restore the saved parameters so the device is left untouched.
# NOTE(review): `word_freq2` (expected columns: Word, Freq) is not built in
# this file — presumably a frequency table derived from the segmented words
# in an omitted step; confirm before running.
opar <- par(no.readonly = TRUE)
par(bg = 'black')
wordcloud(words = word_freq2$Word, freq = word_freq2$Freq,
          scale = c(4, 0.1), max.words = 50,
          random.color = TRUE, colors = rainbow(n = 7))
par(opar)
# 注:文本分词是很重要的,需要不断增删词语,以达到分词的准确性;由于篇幅限制,更具体的内容未展示。