Scraping a News Site's Headlines and Links into Excel with Python

I recently scraped the headlines, article-page links, and publish times from a news site, using bs4 and re. For keeping track of the URLs to crawl, I chose to put them into a Queue and pull them from there; putting them into a set() or saving them to a txt file would work just as well. My regex skills are not great, so the regex part is a bit clumsy. I chose Excel for storage; the data could also go into MySQL, but that code is not written yet and will be added later. The code still has rough spots and is being revised.
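As a minimal standalone sketch of that queue-based bookkeeping (the full class below does the real work; this snippet just isolates the three Queue calls the crawler relies on):

# Minimal sketch of the URL bookkeeping used below (Python 2's Queue module).
from Queue import Queue

urls = Queue()
urls.put("http://hsb.hspress.net/")   # send_req() queues each section link like this
while not urls.empty():               # send_url() drains the queue like this
    print urls.get()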
# coding:utf-8
import requests
import random
import re
from Queue import Queue
from bs4 import BeautifulSoup
from xlwt import Workbook
import sys

reload(sys)
sys.setdefaultencoding("utf-8")


class spider_web_news():
    def __init__(self):
        # Request headers: pick a random User-Agent so requests look less uniform
        user_agent = [
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36',
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36"
        ]
        self.headers = {"User-Agent": random.choice(user_agent)}  # note: the header key is "User-Agent", not "User_agent"
        self.url = Queue()  # collected section links are queued here for later use

    # Visit the home page and collect the section-page links
    def send_req(self, url):
        response = requests.get(url, headers=self.headers)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "lxml")  # parse into a BeautifulSoup object
        # print soup.prettify()
        for base_tag in soup.find_all(class_="menuli"):
            base_url = re.findall('href="(.*?)"', str(base_tag))[0]
            # print base_url  # each section link
            self.url.put(base_url)  # queue the section link

    # Visit each section page and scrape the titles (front-page titles)
    def send_url(self):
        while not self.url.empty():
            base_url = self.url.get()
            response = requests.get(base_url, headers=self.headers)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "lxml")
            for title in soup.find_all(class_="langmubt2"):
                # print "title is ---------", title
                items = re.findall(
                    '<li><span><time class="time" datetime="(.*?)">(.*?)</time></span>'
                    '<a href="(.*?)" style="" target="_blank">(.*?)</a></li>',
                    str(title))
                for mubiao in items:
                    print mubiao[0], mubiao[2], mubiao[3]
                    yield mubiao[0], mubiao[2], mubiao[3]

    # Write the scraped data into an Excel file
    def save_title(self):
        num = 1
        title_file = Workbook(encoding="utf-8")  # the workbook is written as UTF-8
        table = title_file.add_sheet('data')     # create the "data" sheet
        # header row
        table.write(0, 0, "time")
        table.write(0, 1, "url")
        table.write(0, 2, "title")
        all_data = self.send_url()  # consume the tuples yielded by send_url()
        for data in all_data:       # each data item is a (time, url, title) tuple
            table.write(num, 0, data[0])
            table.write(num, 1, data[1])
            table.write(num, 2, data[2])
            num += 1
        print "saving done..."
        title_file.save("002.xls")  # save the file


if __name__ == '__main__':
    url = "http://hsb.hspress.net/"
    A = spider_web_news()
    A.send_req(url)
    # A.send_url()
    A.save_title()
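The MySQL version mentioned above is not written yet; here is a minimal sketch of what it could look like with the Python 2 MySQLdb driver. The connection parameters and the news / news_title names are placeholders of mine, not part of the original post.

# -*- coding: utf-8 -*-
# Sketch only: connection parameters and table/database names are placeholders.
import MySQLdb

def save_title_mysql(rows):
    """Save (time, url, title) tuples into MySQL instead of Excel."""
    conn = MySQLdb.connect(host="localhost", user="root",
                           passwd="password", db="news", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS news_title (
                          id INT AUTO_INCREMENT PRIMARY KEY,
                          time VARCHAR(32),
                          url VARCHAR(255),
                          title VARCHAR(255)
                      )""")
    cursor.executemany(
        "INSERT INTO news_title (time, url, title) VALUES (%s, %s, %s)",
        rows)
    conn.commit()
    cursor.close()
    conn.close()

It could be called in place of save_title(), e.g. save_title_mysql(list(A.send_url())) after A.send_req(url) has filled the queue.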

Original post: https://blog.csdn.net/weixin_43857152/article/details/85098893
