Python Web Scraper Code: Sina News

import requests
res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
print(type(res))
#print(res.text)
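Before parsing, it can help to confirm the request actually succeeded; here is a minimal sketch using the response's status_code attribute (the printed messages are only illustrative):

import requests

res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
if res.status_code != 200:
    # anything other than 200 means the list page was not fetched cleanly
    print('request failed with status', res.status_code)
else:
    print(len(res.text), 'characters downloaded')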
# Usage example: parsing a small HTML snippet with BeautifulSoup
from bs4 import BeautifulSoup
html_sample = ' \
<html> \
<body> \
<h1 id="title">Hello World</h1> \
<a href="#" class="link">This is link1</a> \
<a href="# link2" class="link">This is link2</a> \
</body> \
</html>'

soup = BeautifulSoup(html_sample, 'html.parser')
print(type(soup))
print(soup.text)
# Use select to find all h1 elements
soup = BeautifulSoup(html_sample, 'html.parser')
alink = soup.select('h1')
print(alink)
print(alink[0])
print(alink[0].text)
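As a side note, select takes CSS selectors, while BeautifulSoup's find and find_all do the same lookups with keyword arguments. A minimal comparison sketch, reusing the html_sample defined above:

# find / find_all are the keyword-argument equivalents of select
soup = BeautifulSoup(html_sample, 'html.parser')
print(soup.find_all('a'))                 # like soup.select('a')
print(soup.find_all('a', class_='link'))  # like soup.select('a.link')
print(soup.find(id='title'))              # like soup.select('#title')[0]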
# Use select to find all a elements
soup = BeautifulSoup(html_sample, 'html.parser')
alink = soup.select('a')
print(alink)
for link in alink:
    print(link)
    print(link.text)
# Use select to find all elements with id "title"
alink = soup.select('#title')
print(alink)
# Use select to find all elements with class "link"
alink = soup.select('.link')
for link in alink:
    print(link)
# Use select to get the href link of every a tag
alink = soup.select('a')
for link in alink:
    print(link['href'])
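Note that link['href'] raises a KeyError for any tag without an href attribute; on real pages it can be safer to use Tag.get, which returns a default instead. A minimal sketch, reusing the soup object above:

# get() returns the given default when the attribute is missing, instead of raising KeyError
for link in soup.select('a'):
    print(link.get('href', '(no href)'))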
# Fetch the news list page and parse it with BeautifulSoup
import requests
res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
#print(type(res))
soup = BeautifulSoup(res.text, 'html.parser')
# Each news headline on the list page sits in an element with class "news-item"
for news in soup.select('.news-item'):
    #print(news)
    if len(news.select('h2')) > 0:
        h2 = news.select('h2')[0].text
        time = news.select('.time')[0].text
        a = news.select('a')[0]['href']
        #print(time, h2, a)
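The prints above are commented out; in practice it is convenient to collect each item into a list of dicts for later steps. A minimal sketch under that assumption (the key names title/time/url are just illustrative):

# collect the headline, time and link of every list item
items = []
for news in soup.select('.news-item'):
    if len(news.select('h2')) > 0:
        items.append({
            'title': news.select('h2')[0].text,
            'time': news.select('.time')[0].text,
            'url': news.select('a')[0]['href'],
        })
print(len(items))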
# Fetch a single article page
import requests
res = requests.get('http://news.sina.com.cn/c/2018-07-31/doc-ihhacrcc9897484.shtml')
res.encoding = 'utf-8'
print(type(res))
#print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')
# Title
title = soup.select('.main-title')[0].text
print(title)
# Date and source
datesource = soup.select('.date-source')[0]
print(datesource)
date = datesource.span.text
print(date)
source = datesource.a.text
print(source)
# Alternatively:
#date = soup.select('.date-source')[0].contents[0].strip()
#date
# Time: convert between string and datetime
from datetime import datetime
dt = datetime.strptime(date, '%Y年%m月%d日 %H:%M')  # str -> datetime
print(dt)
dt.strftime('%Y-%m-%d')  # datetime -> str (%m is the month; %M would be minutes)
# Article content (the last <p> is skipped)
article = []
for p in soup.select('#article p')[:-1]:
    article.append(p.text.strip())
#print(article)
' '.join(article)
# Or as a one-liner: ' '.join([p.text.strip() for p in soup.select('#article p')[:-1]])
# Editor (note: lstrip removes a set of leading characters, not a literal prefix)
editor = soup.select('.show_author')[0].text.lstrip('责任编辑:')
print(editor)
# Comments: the comment count comes from a separate JSON API
import requests
comments = requests.get('http://comment5.news.sina.com.cn/page/info?version=1&format=json&\
channel=gn&newsid=comos-hhacrcc9897484&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&\
page_size=3')
#comments.text
import json
jd = json.loads(comments.text)
#print(jd)
commentnum = jd['result']['count']['total']
# Extract the news id from the article URL
newsurl = 'http://news.sina.com.cn/c/2018-07-31/doc-ihhacrcc9897484.shtml'
newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')
# Or with a regular expression (more robust, since rstrip/lstrip strip character sets rather than literal substrings)
import re
re.search('doc-i(.*).shtml', newsurl).group(1)
# Wrap the comment count lookup in a function
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=json&\
channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&\
page_size=3'
import re
import json
def getCommentCount(newsurl):
    newsid = re.search('doc-i(.*).shtml', newsurl).group(1)
    comments = requests.get(commentURL.format(newsid))
    jd = json.loads(comments.text)
    return jd['result']['count']['total']

# Test
news = 'http://news.sina.com.cn/c/2018-07-31/doc-ihhacrcc9897484.shtml'
getCommentCount(news)
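If the comment API answers without the expected fields, getCommentCount raises an exception; a slightly more defensive variant could look like the sketch below, reusing the commentURL template above (the timeout value and the 0 fallback are assumptions, not part of the original code):

import re
import json
import requests

def getCommentCountSafe(newsurl):
    # return 0 when the id cannot be extracted or the JSON lacks the expected fields
    m = re.search('doc-i(.*).shtml', newsurl)
    if m is None:
        return 0
    comments = requests.get(commentURL.format(m.group(1)), timeout=10)
    try:
        return json.loads(comments.text)['result']['count']['total']
    except (ValueError, KeyError):
        return 0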
# All the information for a single news article
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('.main-title')[0].text
    result['newssource'] = soup.select('.date-source')[0].a.text
    date = soup.select('.date-source')[0].span.text
    result['dt'] = datetime.strptime(date, '%Y年%m月%d日 %H:%M')  # str -> datetime
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#article p')[:-1]])
    result['editor'] = soup.select('.show_author')[0].text.lstrip('责任编辑:')
    result['comments'] = getCommentCount(newsurl)
    return result

# Test
news = 'http://news.sina.com.cn/c/nd/2018-07-24/doc-ihftenhz7547208.shtml'
getNewsDetail(news)
# Parse the paginated list API
import requests
import json
res = requests.get('http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&\
cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&\
format=json&page=6&callback=newsloadercallback&_=1533026268219')
# the API returns JSONP, i.e. JSON wrapped in newsloadercallback(...), so strip the wrapper before json.loads
jd = json.loads(res.text.lstrip(' newsloadercallback(').rstrip(');'))
#jd
for ent in jd['result']['data']:
    print(ent['url'])
# Build a function that parses one list page and fetches the detail of every linked article
def parseListLinks(url):
    newsdetails = []
    res = requests.get(url)
    jd = json.loads(res.text.lstrip(' newsloadercallback(').rstrip(');'))
    for ent in jd['result']['data']:
        try:
            print(ent['url'])
            newsdetails.append(getNewsDetail(ent['url']))
        except AttributeError:
            pass
    return newsdetails

# Test
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&\
cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&\
format=json&page=6&callback=newsloadercallback&_=1533026268219'
parseListLinks(url)
# Use a for loop to build the multi-page links and batch-fetch the article text of every page
def createUrl(num):
    url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&\
cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&\
format=json&page={}'  # {} is the placeholder for the page number
    news_total = []
    for i in range(1, num):  # range(1, num) stops at num-1, so createUrl(3) fetches pages 1 and 2
        newsurl = url.format(i)
        print(newsurl)
        newsary = parseListLinks(newsurl)
        news_total.extend(newsary)
    return news_total

# Test
news_total = createUrl(3)
#print(news_total)
len(news_total)
# Organize the data with pandas
import pandas
df = pandas.DataFrame(news_total)
df.head()
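With everything in a DataFrame, quick checks become one-liners; for example, sorting by the comments column produced by getNewsDetail (just an illustrative query):

# show the most commented articles first
df.sort_values('comments', ascending=False).head()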
# Save the data
df.to_csv('news.csv')  # to CSV
import sqlite3  # to SQLite
with sqlite3.connect('news.sqlite') as db:
    df.to_sql('news', con = db)
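Two caveats worth noting: with the default to_csv encoding the Chinese text may show up garbled when the CSV is opened in Excel, and running to_sql a second time against an existing table raises an error unless if_exists is set. A hedged sketch of both fixes:

import sqlite3

# utf-8-sig adds a BOM so spreadsheet software detects the encoding correctly
df.to_csv('news.csv', encoding='utf-8-sig')
# if_exists='replace' drops and recreates the table; 'append' would add rows instead
with sqlite3.connect('news.sqlite') as db:
    df.to_sql('news', con=db, if_exists='replace')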
# Read the data back from SQLite
import sqlite3
with sqlite3.connect('news.sqlite') as db:
    df2 = pandas.read_sql_query('SELECT * FROM news', con = db)
df2.head()