/**
 * Reloads HanLP's custom-word and stop-word dictionaries from two text files.
 *
 * Both files are read through `env` and collected into the calling JVM, so the
 * dictionary updates are applied in the process that invokes this method.
 * NOTE(review): whether these updates are visible to tasks running in other
 * JVMs is not established here — confirm for cluster deployments.
 *
 * Blank lines in either file are ignored.
 *
 * @param env          execution environment used to read the files
 * @param stopWordPath path of the stop-word file; only the first tab-separated
 *                     field of each line is used as the word
 * @param newWordPath  path of the custom-word file; each line is `word<TAB>tag`
 * @throws IllegalArgumentException if a non-blank line of the custom-word file
 *                                  has no tag field
 */
def reReadDefineDictionarysFromFile(env: ExecutionEnvironment, stopWordPath: String, newWordPath: String): Unit = {
  // Custom dictionary: one "word \t tag" entry per line.
  env.readTextFile(newWordPath).collect()
    .filter(_.trim.nonEmpty) // a blank line has no tag field and used to crash the load
    .foreach { line =>
      line.split("\t") match {
        case Array(word, tag, _*) =>
          CustomDictionary.insert(word, tag)
        case _ =>
          // Fail with the offending line instead of a bare index error.
          throw new IllegalArgumentException(
            s"Malformed custom dictionary line in $newWordPath (expected 'word<TAB>tag'): '$line'")
      }
    }

  // Stop-word dictionary: lines may be "word \t tag", only the word is used.
  env.readTextFile(stopWordPath).collect()
    .filter(_.trim.nonEmpty) // do not register the empty string as a stop word
    .foreach { line =>
      line.split("\t").headOption.foreach(word => CoreStopWordDictionary.add(word))
    }
}
/**
 * Segments a single string into terms.
 *
 * @param str text to segment
 * @return the terms produced by a newly created HanLP segment
 *         (`HanLP.newSegment()`), converted to an immutable Scala list
 */
def segFromString(str: String): List[Term] = {
  // Organization recognition stays off: .enableOrganizationRecognize(true) is not applied.
  HanLP.newSegment().seg(str).asScala.toList
}
/**
 * Segments every document of a data set.
 *
 * @param data pairs whose second element is the text to segment; the first
 *             element is passed through unchanged
 * @return pairs of the original first element and the list of terms obtained
 *         from the corresponding text
 */
def segFromString(data: DataSet[(String, String)]): DataSet[(String, List[Term])] = {
  data.map { doc =>
    // A new segment is created for each record.
    val terms = HanLP.newSegment().seg(doc._2).asScala.toList
    (doc._1, terms)
  }
}