Contents
  1. Autohome (汽车之家)
  2. Tencent Comics (腾讯动漫)
  3. Maoyan (猫眼)
  4. 365 TV Series (365电视剧网)
  5. 61ertong (六一儿童网)

These scripts scrape data from several big portal sites with Python, e.g. children's video and Tencent Comics, mostly as Python practice. The core of each one is the same: find where the target content sits in the HTML source, parse the page with bs4, and pull out the tag text or attribute values.
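As a minimal sketch of that pattern (the HTML snippet here is made up for illustration), bs4 locates tags by class and exposes both their text and their attributes:

from bs4 import BeautifulSoup

# hypothetical snippet standing in for a downloaded page
html = '<ul><li><a class="brand_title" title="Demo">Demo Brand</a></li></ul>'
soup = BeautifulSoup(html, "lxml")
for a in soup.findAll("a", {"class": "brand_title"}):
    print(a.get_text().strip())  # tag text: "Demo Brand"
    print(a["title"])            # attribute value: "Demo"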

Autohome (汽车之家)

Take Autohome-style pages such as http://car.qichedaquan.com/carmaster/KR627009.
The path suffix is a short code for the car brand, and in the HTML source the car title shown on the page is stored in an <a> tag with class=brand_title, so we can parse it out:

import urllib2
from bs4 import BeautifulSoup

'''
# Batch version: loop over every brand short code and write short_code#title pairs.
short_names=['AD','ASDEMD','AEFNLMO','ARCFOX','ACQC'
,'BT','BK','BM599336','BJ631837','BC209179','BYD','BZ443884','BT866387','BJ','BSJ643546'
,'BQHS','BQSB','CH451637','BL72846','BQXNY','BQWW','BQZZ','BW','BSQC','BBS'
,'BJD','BQRL-1157793070','BYTON','BJQX','CAJC642117','CASY53411','CAQXC'
,'CC539565','CAKY365949','CG18634','DZ107480','DFFG828649'
,'DFFX326251','DN253165','DFFS832078','DFXK691672','DQ45846','DS','DFFZZRC'
,'DFYF649334','DFFD436608','DF313869','DFRTT','FT224270','FT','FT90426','FLL103610'
,'FYT197920','FD520289','FQQT19753','GQCS','GZQC323132','GQXNYQCYXGS','GMC400790'
,'GQJA','GQZX113564','GJQC','HF','HQ693128','HM814048','HT','HT597540','HH162221'
,'HF315054','HTXNY188117','HMSYC151222','HG91078','HS499158','HXQC','HZXNY','HV'
,'JLQC208661','Jeep25501','JH826898','JB','JB765844','JL','JT','JM','JL733653'
,'JLJTQQ30728','JLJTXNY-1157793070','JL10353','JLKC156768','JN798239','JHDZ'
,'KDLK370961','KR627009','KY177022','KLSL45361','KDQQYDDQC374821','KW448128','KNSK242789'
,'KTM691221','KES890561','LKSS','LK530989','LH','LBJN129409','LM246330','LYNKPCO'
,'LBQC650519','LN-1157793070','LF','LF385292','LSLS251386','LH190586','LDDD410869'
,'LTS405740','LN817474','LBQC','LDFZ470198','LP','MZD113716','MJ261150','MSLD347109'
,'MINI587735','MKL802195','NZJ167645','JG','OSQC','OB187787','Polestar','QR704708'
,'QY57393','QC768264','QL175818','QTQC','QDQC60462','RC753521','RW591026','HYDDQC'
,'SKD123646','SL24942','SBL447018','SQDTMAXUS440015','SWMSWQC329718','smart702377'
,'SL305222','SM','SQTJ444878','SFMOTORS','TSL29384','TS718205','WL','WEW419641'
,'WEY274715','WSL826895','WCYZ88224','WLQC688997','WM','WY','XD219739','XFL692945'
,'XTL344693','XPQC14872','XYT','YMQC725696','YQ','YFND590916','YS266781','YWK354038'
,'YDXNY847567','YY422490','YBN144113','YJ562477','YL','ZT523691','ZH363740','ZOBCFC413342'
,'ZX548503','ZD688585','ZTKC702887','ZJKES518822','ZN758314']
url = "http://car.qichedaquan.com/carmaster/"
f = open('/Users/huwenhao02/workspace/car_tag_0822', 'w')
for short in short_names:
searchurl = url + short
req = urllib2.Request(searchurl)
resp = urllib2.urlopen(req)
respHtml = resp.read()
resp.close()
soup = BeautifulSoup(respHtml,"lxml")
ms = soup.findAll("a",{'class':"brand_title"})
out = []
for m in ms:
car_tag = m.get_text().replace('\r','').replace('\n','').replace(' ','')
out.append(car_tag)

out_car = set(out)
for car in out_car:
f.write(short + "#" + car.encode('utf-8') + '\n')

f.close()
'''
url = "http://car.qichedaquan.com/carmaster/KR627009"
f = open('/Users/huwenhao02/workspace/car_tag_0822', 'w')
req = urllib2.Request(url)
resp = urllib2.urlopen(req)
respHtml = resp.read()
resp.close()
soup = BeautifulSoup(respHtml,"lxml")
ms = soup.findAll("a",{'class':"brand_title"})
out = []
for m in ms:
car_tag = m.get_text().replace('\r','').replace('\n','').replace(' ','')
out.append(car_tag)

out_car = set(out)
for car in out_car:
print car
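These scripts are all Python 2 (urllib2, print statements). On Python 3, urllib2 became urllib.request; a minimal sketch of the same fetch-and-parse step, assuming the page structure is unchanged:

from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "http://car.qichedaquan.com/carmaster/KR627009"
soup = BeautifulSoup(urlopen(url).read(), "lxml")
titles = {a.get_text().strip() for a in soup.findAll("a", {"class": "brand_title"})}
for car in sorted(titles):
    print(car)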

Tencent Comics (腾讯动漫)

The idea is the same: study the HTML structure and locate the information you want.

import urllib2
from bs4 import BeautifulSoup
import time

total = 27394  # comics listed at the time; at ~28 per page that is 978 pages
page = 1
while page <= 978:
    url = "http://ac.qq.com/Comic/all/search/time/page/{}".format(page)
    req = urllib2.Request(url)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    resp.close()
    soup = BeautifulSoup(respHtml, "lxml")
    # each comic's cover link carries the title in its title attribute
    ms = soup.findAll("a", {'class': "mod-cover-list-thumb mod-cover-effect ui-db"})
    f = open('/Users/huwenhao02/workspace/comic_tag_0904', 'a+')
    for m in ms:
        print m['title']
        f.write(m['title'].encode('utf-8') + '\n')
    f.close()
    print 'write offset=', page
    page += 1
    time.sleep(10)  # pause between pages to stay polite

Maoyan (猫眼)

Maoyan is a movie site and fairly comprehensive, with channels for latest, now showing, classics, and so on. It does throttle requests, though, presumably via anti-scraping logic, so this script rotates User-Agent headers and paces itself.

import urllib2
from bs4 import BeautifulSoup
import time
import random

page = 100
offset = 3000  # 30 films per page, so page 100 starts at offset 3000
user_agents = [
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
]

while page <= 122:
    # rotate the User-Agent to soften the anti-scraping checks
    headers = {'User-Agent': random.choice(user_agents)}
    url = "http://maoyan.com/films?showType=3&yearId=4&offset={}".format(offset)
    req = urllib2.Request(url, headers=headers)  # send the rotated User-Agent with the request
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    resp.close()
    soup = BeautifulSoup(respHtml, "lxml")
    # each film title sits in <div class="channel-detail movie-item-title" title="...">
    ms = soup.findAll("div", {'class': "channel-detail movie-item-title"})
    f = open('/Users/huwenhao02/workspace/film_tag_0904', 'a+')
    for m in ms:
        print m['title']
        f.write(m['title'].encode('utf-8') + '\n')
    f.close()
    print 'write offset=', offset
    offset += 30
    page += 1
    time.sleep(1)
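If the anti-scraping limit still trips, one cheap lever (an assumption on my part, not something verified against Maoyan) is to randomize the pause as well as the User-Agent, e.g. by replacing the fixed sleep at the end of the loop with:

time.sleep(random.uniform(1, 5))  # irregular pacing looks less bot-like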

365 TV Series (365电视剧网)

Same story: read the HTML source. The only thing that changes is the channel, which this version swaps by hand via the commented-out URLs (a loop over the channels is sketched after the code).

import urllib2
from bs4 import BeautifulSoup
import time

page = 2  # page 1 of each channel is the bare channel URL, without index_N
while page <= 5:
    # swap the channel by hand: gangju / hanju / neidi / wangluoju
    #url = "http://www.yue365.com/tv/gangju/"
    #url = "http://www.yue365.com/tv/gangju/index_2.shtml"
    #url = "http://www.yue365.com/tv/hanju/"
    #url = "http://www.yue365.com/tv/hanju/index_{}.shtml".format(page)
    #url = "http://www.yue365.com/tv/neidi/"
    #url = "http://www.yue365.com/tv/neidi/index_{}.shtml".format(page)
    #url = "http://www.yue365.com/tv/list/wangluoju/"
    url = "http://www.yue365.com/tv/list/wangluoju/index_{}.shtml".format(page)
    req = urllib2.Request(url)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    resp.close()
    soup = BeautifulSoup(respHtml, "lxml")
    # each show sits in <div class="mv_name"><a title="...">
    ms = soup.findAll("div", {'class': "mv_name"})
    f = open('/Users/huwenhao02/workspace/tvname_tag_0904', 'a+')
    for m in ms:
        print m.a['title']
        f.write(m.a['title'].encode('utf-8') + '\n')
    f.close()
    print 'write offset=', page
    page += 1
    time.sleep(1)
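Since only the channel slug changes between runs, the hand-swapped URLs can be folded into a loop. A sketch, with the slugs taken from the commented URLs above (page 1 of each channel is the bare channel URL, so this starts at page 2 like the original):

import urllib2
import time
from bs4 import BeautifulSoup

channels = ['gangju', 'hanju', 'neidi', 'list/wangluoju']
f = open('/Users/huwenhao02/workspace/tvname_tag_0904', 'a+')
for channel in channels:
    for page in range(2, 6):
        url = "http://www.yue365.com/tv/{}/index_{}.shtml".format(channel, page)
        soup = BeautifulSoup(urllib2.urlopen(url).read(), "lxml")
        for m in soup.findAll("div", {'class': "mv_name"}):
            f.write(m.a['title'].encode('utf-8') + '\n')
        time.sleep(1)
f.close()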

61ertong (六一儿童网)

Some sites don't carry the visible page content in the raw HTML at all; it only shows up in the inspected (JavaScript-rendered) DOM. Scraping those takes an extra tool: selenium, which you can search for and download, plus a chromedriver binary downloaded and configured to match.


from selenium import webdriver
import os

chromedriver = "/usr/local/bin/chromedriver"
os.environ['webdriver.chrome.driver'] = chromedriver

page = 1
while page <= 22:
    # the list is rendered by JS, so drive a real browser instead of urllib2
    driver = webdriver.Chrome(chromedriver)
    url = "http://movie.61ertong.com/list.html#type:1,letter:,country:,age:,serial:,order:3,video_type:,category:,subject:,grade:,page:{}".format(page)
    driver.get(url)
    #tags = driver.find_element_by_css_selector("div[@class='new_list']")
    # the titles are <p> elements inside the #J_result container
    tags = driver.find_element_by_id('J_result').find_elements_by_tag_name('p')
    for tag in tags:
        print tag.text
    driver.quit()
    page += 1
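Two refinements worth sketching (assumptions on my part, not part of the original script): reuse one browser across all pages instead of launching Chrome 22 times, and wait explicitly for the JS-filled #J_result container rather than reading it immediately after driver.get():

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome("/usr/local/bin/chromedriver")
try:
    for page in range(1, 23):
        url = "http://movie.61ertong.com/list.html#type:1,letter:,country:,age:,serial:,order:3,video_type:,category:,subject:,grade:,page:{}".format(page)
        driver.get(url)
        # block until the JS has filled in the result container (10s timeout)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'J_result')))
        for tag in driver.find_element_by_id('J_result').find_elements_by_tag_name('p'):
            print(tag.text)
finally:
    driver.quit()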