Contents
  1. Parsing and using JSON
  2. Using BeautifulSoup
  3. Scraping Toutiao
  4. Scraping JD reviews

A Python crawler reads a page's HTML source and extracts the information it needs from it. This involves BeautifulSoup, parsing and converting JSON, and dealing with encoding issues.

Parsing and using JSON

In Python, JSON handling is done with the json module and relies on the dict type (dictionaries).

import json

b = 'b'
c = 'c'
a = 'a'
data1 = '{b: 789, c: 456, a: 123}'
dictData = eval(data1)  # eval turns the string into a dict (the bare names resolve to the variables above)
print(type(dictData), dictData)
print(type(dictData))
encode_json = json.dumps(dictData)  # dumps turns a dict into a JSON str
print(type(encode_json), encode_json)

decode_json = json.loads(encode_json)  # loads turns a JSON str into a dict; note that keys must be wrapped in double quotes
print(type(decode_json))
print(decode_json['a'])
print(decode_json)
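
If the source string already uses double-quoted keys it is valid JSON, and json.loads can parse it directly without going through eval. A minimal sketch (the string here is a made-up example):

import json

data2 = '{"b": 789, "c": 456, "a": 123}'  # keys in double quotes, so this is valid JSON
decoded = json.loads(data2)               # -> {'b': 789, 'c': 456, 'a': 123}
print(type(decoded), decoded['a'])        # <class 'dict'> 123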

Using BeautifulSoup

The page source carries its information in many attributes and items, so it has to be parsed to reach the nodes and their child nodes. Below are two examples I have used before; for anything else, refer to the BeautifulSoup documentation.
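
As a quick reminder of the basic BeautifulSoup calls used in the examples below, here is a minimal sketch; the HTML snippet, class name, and tags are made up for illustration:

from bs4 import BeautifulSoup

html = '<div class="item"><a href="/a">first</a><a href="/b">second</a></div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div', class_='item')   # first matching node
links = div.find_all('a')               # all child <a> nodes
for a in links:
    print(a['href'], a.get_text())      # attribute access and text content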

Scraping Toutiao

Fetch the page source from a Toutiao URL and inspect it; the content we need to extract looks like this:

var BASE_DATA={
userInfo: {
id: 0,
userName: '',
avatarUrl: '',
isPgc: false,
isOwner: false
},
headerInfo: {
id: 0,
isPgc: false,
userName: '',
avatarUrl: '',
isHomePage: false,
chineseTag: '房产',
crumbTag: 'search/?keyword=%E6%88%BF%E4%BA%A7',
hasBar: true
},
articleInfo: {
title: '开学季已至,徐州中小学生开始报到 教育难题你考虑解决吗',
content: '<p
....
}

The Python code that implements the scraping is below:


import urllib.request
from bs4 import BeautifulSoup
import json
import re

invalid_escape = re.compile(r'\\[0-7]{1,3}')  # up to 3 octal digits, for byte values up to FF

def replace_with_byte(match):
    return chr(int(match.group(0)[1:], 8))

def repair(brokenjson):
    return invalid_escape.sub(replace_with_byte, brokenjson)

def getArticleFromUrl(url):
    """
    Fetch the article information from a Toutiao URL.
    Some stray characters still cannot be stripped out completely.
    """
    try:
        resp = urllib.request.urlopen(url)
    except Exception:
        print('url open error')
        return None
    respHtml = resp.read()
    resp.close()
    soup = BeautifulSoup(respHtml, 'html.parser', from_encoding="utf-8")
    content = 'null'
    scripts = soup.find_all('script')
    for sc in scripts:  # sc is a bs4.element.Tag; .name holds the tag name
        line = str(sc)
        if line.startswith('<script>var BASE_DATA ='):
            lineRes = line.replace('<script>var BASE_DATA =', '').replace(';</script>', '').replace('.replace(/<br \/>|\\n|\\r/ig, \'\')', '').replace('.replace(/<br \/>/ig, \'\')', '')
            res = lineRes.replace(' ', '').replace(' ', '')
            vs = res.split('\n')
            chineseTag = vs[14]   # fixed line offsets inside the BASE_DATA block
            title = vs[19]
            content = re.sub('<[^<]*>|&[a-z]+\073|\/p', '', vs[20])
            abstract = vs[33]
    return content

def getArticlesFromUrls(urlsPath, savefile):
    """
    Given a set of URLs read from a txt file, fetch the article
    information for each one and write the results to another txt file.
    """
    f = open(urlsPath, encoding='utf-8')
    save = open(savefile, "a", encoding='utf-8')
    line = f.readline()
    while 1:
        if not line:
            break
        else:
            line = line.replace('\n', '')
            content = getArticleFromUrl(line)
            if content is not None:
                res = (line + '\t' + content).replace('\u200d', '').replace('\xa3', '').replace('\ue20c', '')
            else:
                res = line + '\t' + 'null'
            # write the result to the txt file
            save.write(res + '\n')
            line = f.readline()
    save.close()

# fetch article information for a set of URLs
urlsPath = r'E:\workspace\data\JRtoutiaoArticle\urls.txt'
savefile = r'E:\workspace\data\JRtoutiaoArticle\urls_articles.txt'

getArticlesFromUrls(urlsPath, savefile)
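
The repair helper defined above is not actually called in this listing; it is intended for cleaning up invalid octal escape sequences before handing an extracted BASE_DATA string to json.loads. A minimal sketch of how it could be wired in (the broken string here is a made-up example):

import json
import re

invalid_escape = re.compile(r'\\[0-7]{1,3}')

def repair(brokenjson):
    # rewrite octal escapes like \123 into the literal character they encode,
    # since json.loads rejects them
    return invalid_escape.sub(lambda m: chr(int(m.group(0)[1:], 8)), brokenjson)

broken = '{"title": "demo\\123"}'   # hypothetical string containing an octal escape
print(json.loads(repair(broken)))   # {'title': 'demoS'}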

Scraping JD reviews

#coding=utf-8

'''
Scrape the review data of a JD product (Python 2).
decode converts a string from another encoding to unicode, e.g. str1.decode('gb2312') converts the gb2312-encoded string str1 to unicode.
encode converts a unicode string to another encoding, e.g. str2.encode('gb2312') converts the unicode string str2 to gb2312.
'''
import urllib2
import re, bs4
from bs4 import BeautifulSoup

# 1721312 is the product id; the fourth number selects the review type:
# good = 3, medium = 2, bad = 1, all = 0, with pictures = 4
url = "http://club.jd.com/review/1721312-1-1-0.html"
req = urllib2.Request(url)
resp = urllib2.urlopen(req)
respHtml = resp.read()
resp.close()
soup = BeautifulSoup(respHtml, from_encoding="gbk")
# get the total number of pages
Page = soup.find("div", class_="pagin fr")
conts = (str(Page).replace('<div class="pagin fr">', '').replace('</div>', ''))  # the fragment of page source that contains the total page count
totlePage = 0
try:
    totlePage = int(conts.split('</a>')[3].split('>')[3])  # the total page count itself
except Exception as e:
    print('this product has no reviews')
# print(totlePage)
############################################################################
# collect all reviews of one category
# for currPage in range(totlePage):
#     print(currPage)
#     newurl = "http://club.jd.com/review/1721312-1-" + str(currPage + 1) + "-1.html"
#     req = urllib2.Request(newurl)
#     resp = urllib2.urlopen(req)
#     respHtml = resp.read()
#     resp.close()
#     soup = BeautifulSoup(respHtml, from_encoding="gbk")
#     m = soup.findAll("div", {'class': "comment-content"})
#     # save to a txt file
#     import codecs
#     evaluate_z = open("tr_c.txt", "a")
#     for obj in m:
#         con = ''.join(obj.find('dd').contents).replace('\n', '')
#         print con
#         evaluate_z.write(con.encode('utf8') + '\t' + 'class=bad' + '\n')
#     evaluate_z.close()

############################################################################
# scrape all of the reviews
evaluate_z = open("tr_all.txt", "a")
for currPage in range(totlePage):
    print(currPage)
    newurl = "http://club.jd.com/review/1721312-1-" + str(currPage + 1) + "-0.html"
    req = urllib2.Request(newurl)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    resp.close()
    soup = BeautifulSoup(respHtml, from_encoding="gbk")
    m = soup.findAll("div", {'class': "i-item"})
    for obj in m:
        star = str(obj.find('span'))[20:21]  # the star rating
        content = str(obj.find('dd')).replace('\n', '').replace('<dd>', '').replace('</dd>', '')  # the review text
        print content + '\t' + star
        evaluate_z.write(content + '\t' + 'star=' + star + '\n')
evaluate_z.close()
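
To illustrate the decode/encode note in the docstring above: decode turns bytes in a given encoding into a unicode string, and encode turns a unicode string back into bytes. A small Python 3 sketch with a made-up string:

# bytes -> str (decode) and str -> bytes (encode); 'gbk' is used as in the scraper above
raw = '好评'.encode('gbk')        # bytes in gbk encoding
text = raw.decode('gbk')          # back to a unicode string: '好评'
utf8_bytes = text.encode('utf8')  # re-encode as utf-8, e.g. before writing to a file
print(text, utf8_bytes)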