calibre抓取网页内容生成电子书
#!/usr/bin/python
# encoding: utf-8
from calibre.web.feeds.recipes import BasicNewsRecipe


class Pro_Git_Chinese(BasicNewsRecipe):
    """calibre recipe that turns a cnblogs category page into an e-book.

    The category page at ``url_pre`` is used as the table of contents:
    every anchor carrying an ``id`` inside ``<div class="entrylist">`` is
    treated as one article.

    NOTE(review): the class name looks like a leftover from another recipe;
    it is kept unchanged so existing references keep working.
    """

    title = 'QCustomplot'
    description = 'QCustomplot介绍'
    cover_url = ''
    # calibre's recipe API documents the author attribute as ``__author__``;
    # the earlier ``_author_`` spelling is not an attribute calibre looks at.
    __author__ = '朝十晚八'
    url_pre = 'https://www.cnblogs.com/swarmbees/category/908110.html'
    no_stylesheets = True  # drop the site's CSS
    # Only keep content found inside elements with class "blogpost-body".
    keep_only_tags = [{'class': 'blogpost-body'}]
    simultaneous_downloads = 1  # fetch one page at a time

    def parse_index(self):
        """Build the article list from the category (index) page.

        Returns:
            A one-element list ``[(feed_title, articles)]`` where
            ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts,
            in the reverse of the order the links appear on the page.

        Raises:
            ValueError: if the page has no ``<div class="entrylist">``.
        """
        soup = self.index_to_soup(self.url_pre)

        # The article links live inside <div class="entrylist">.
        entry_list = soup.find('div', {'class': 'entrylist'})
        if entry_list is None:
            raise ValueError(
                'No <div class="entrylist"> found at %s' % self.url_pre)

        articles = []
        # Only anchors that carry an id attribute are article links.
        for link in entry_list.findAll('a', id=True):
            # tag_to_string copes with nested markup and empty anchors,
            # where contents[0].strip() would raise.
            link_title = self.tag_to_string(link).strip()
            link_url = link.get('href')
            if not link_title or not link_url:
                continue
            articles.append({'title': link_title, 'url': link_url})

        # The book should list articles in the opposite order from the page
        # (presumably the page is newest-first -- verify against the site).
        # One reverse replaces the former insert(0, ...) per article.
        articles.reverse()

        return [(self.title, articles)]
--------------------------------------------------------------------------------------------------------------------
#!/usr/bin/python
# encoding: utf-8
from calibre.web.feeds.recipes import BasicNewsRecipe


class Pro_Git_Chinese(BasicNewsRecipe):
    """calibre recipe that turns the devbean.net catalogue page into an e-book.

    The catalogue page at ``url_pre`` is used as the table of contents:
    every anchor inside the first ``<ol>`` on the page is treated as one
    article.

    NOTE(review): the class name looks like a leftover from another recipe;
    it is kept unchanged so existing references keep working.
    """

    title = 'QT学习之路2'
    description = ''
    __author__ = '豆子'
    cover_url = ''
    simultaneous_downloads = 5
    url_pre = 'https://www.devbean.net/2012/08/qt-study-road-2-catelog/'
    no_stylesheets = True
    remove_javascript = True
    # Only keep content found inside elements with class
    # "thecontent clearfix".
    keep_only_tags = [{'class': 'thecontent clearfix'}]

    def parse_index(self):
        """Build the article list from the catalogue (index) page.

        Returns:
            A one-element list ``[(feed_title, articles)]`` where
            ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts,
            in the order the links appear on the page.

        Raises:
            ValueError: if the page contains no ``<ol>`` element.
        """
        soup = self.index_to_soup(self.url_pre)

        # The catalogue is the first ordered list on the page.
        catalogue = soup.find('ol')
        if catalogue is None:
            raise ValueError('No <ol> catalogue found at %s' % self.url_pre)

        articles = []
        for link in catalogue.findAll('a'):
            # tag_to_string copes with nested markup and empty anchors,
            # where contents[0].strip() would raise.
            link_title = self.tag_to_string(link).strip()
            link_url = link.get('href')
            if not link_title or not link_url:
                continue
            articles.append({'title': link_title, 'url': link_url})

        return [(self.title, articles)]