def getArticleFromUrl(url):
    """Fetch a Toutiao (今日头条) article page and return its body text.

    The page is expected to embed the article in an inline
    ``<script>var BASE_DATA = ...`` block.  That block is split into lines
    and the line at index 20 is taken as the article content, with HTML
    tags, HTML entities and stray ``/p`` fragments stripped out.  Some
    unwanted characters may still survive this clean-up.

    Args:
        url: Address of the article page.

    Returns:
        ``None`` if the URL could not be opened; the string ``'null'`` if no
        usable ``BASE_DATA`` script was found; otherwise the cleaned text.
    """
    try:
        resp = urllib.request.urlopen(url)
    except Exception:
        print('url open error')
        return None
    # Close the response even if reading the body fails.
    try:
        respHtml = resp.read()
    finally:
        resp.close()

    soup = BeautifulSoup(respHtml, from_encoding="utf-8")

    scriptPrefix = '<script>var BASE_DATA ='
    # Position of the content line inside the BASE_DATA script once it has
    # been split on newlines (layout-dependent; taken from the original code).
    contentLineIndex = 20
    # Removes HTML tags, HTML entities such as "&nbsp;", and leftover "/p".
    markupPattern = r'<[^<]*>|&[a-z]+;|/p'

    content = 'null'
    for sc in soup.find_all('script'):
        # sc is a bs4.element.Tag; str() gives its full markup.
        line = str(sc)
        if not line.startswith(scriptPrefix):
            continue
        # Drop the script wrapper and the JavaScript ".replace(...)" calls
        # that the page appends to its string literals.
        lineRes = (
            line.replace(scriptPrefix, '')
            .replace(';</script>', '')
            .replace(r".replace(/<br \/>|\n|\r/ig, '')", '')
            .replace(r".replace(/<br \/>/ig, '')", '')
        )
        # NOTE(review): the original applied this space removal twice with
        # what appears to be the same argument; the second call may once have
        # targeted a tab or non-breaking space -- confirm against history.
        res = lineRes.replace(' ', '')
        vs = res.split('\n')
        # Only the content line is used, so only that index is required;
        # a shorter script leaves the 'null' placeholder in place instead
        # of raising IndexError.
        if len(vs) > contentLineIndex:
            content = re.sub(markupPattern, '', vs[contentLineIndex])
    return content
def getArticlesFromUrls(urlsPath, savefile):
    """Fetch the article for every URL listed in a text file and save them.

    Reads ``urlsPath`` (UTF-8, one URL per line), calls
    ``getArticleFromUrl`` on each line, and appends one
    ``"<url>\\t<content>"`` line per URL to ``savefile`` (UTF-8).  When the
    article could not be fetched, ``"<url>\\tnull"`` is written instead.

    Args:
        urlsPath: Path of the text file containing the URLs.
        savefile: Path of the output text file; opened in append mode.
    """
    # Characters stripped from successfully fetched records.
    unwantedChars = str.maketrans('', '', '\u200d\xa3\ue20c')

    # Context managers guarantee both files are closed, even on errors.
    with open(urlsPath, encoding='utf-8') as f, \
            open(savefile, "a", encoding='utf-8') as save:
        for rawLine in f:
            line = rawLine.replace('\n', '')
            content = getArticleFromUrl(line)
            if content is not None:
                res = (line + '\t' + content).translate(unwantedChars)
            else:
                res = line + '\t' + 'null'
            # Write the fetched content to the output text file.
            save.write(res + '\n')