[www.cc148.com] novel scraper

jupiter
2021-01-22 / 0 comments / 660 views
Note:
This article was last updated on 2021-12-07. If any content or images no longer load, please leave a comment.


from bs4 import BeautifulSoup
from urllib.request import urlopen
from progressbar import ProgressBar, Percentage, Bar, Timer, ETA

# Fetch the catalogue page's HTML and wrap it in a BeautifulSoup object
catalogue_url = "https://www.cc148.com/19_19568/"
html = urlopen(catalogue_url).read().decode('gbk')
soup = BeautifulSoup(html, features='lxml')

# Extract the novel's title from the catalogue page HTML
title = soup.title.text.split("最新章节列表")[0]

# Collect the URL of every chapter into chapter_url_list
chapter_url_list = []
for dd in soup.find_all('dd'):
    chapter_url = catalogue_url + dd.a["href"]
    chapter_url_list.append(chapter_url)

# Set up a progress bar
widgets = ['Downloading -- ' + title, Percentage(), ' ', Bar('#'), ' ', Timer(), ' ', ETA()]
pbar = ProgressBar(widgets=widgets, maxval=len(chapter_url_list)).start()
count = 0

# Download each chapter in chapter_url_list and save it to a text file in the current directory
txt_file_name = title + ".txt"
with open(txt_file_name, 'w', encoding='utf-8') as f:
    for chapter_url in chapter_url_list:
        chapter_html = urlopen(chapter_url).read().decode('gbk')
        chapter_soup = BeautifulSoup(chapter_html, features='lxml')
        
        chapter_title = chapter_soup.title.text.split('_')[0]
        chapter_content = chapter_soup.find("div",{"id":"content"}).text
        
        chapter_text = chapter_title + "\n" + chapter_content + "\n"
        f.write(chapter_text)
        
        count += 1
        pbar.update(count)

pbar.finish()
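Two fragile spots in the script above are worth noting. First, `catalogue_url + dd.a["href"]` only produces a valid URL when every href is relative to the catalogue page; `urllib.parse.urljoin` also handles root-absolute hrefs. Second, `decode('gbk')` raises an exception if a page contains a character outside strict GBK; `gb18030` is a superset of GBK and a lenient error mode avoids a crash mid-download. A minimal sketch of both fixes (the helper names `chapter_full_url` and `safe_decode` are my own, not part of the original script):

```python
from urllib.parse import urljoin

def chapter_full_url(catalogue_url, href):
    # urljoin handles relative hrefs ("12345.html") and root-absolute
    # hrefs ("/19_19568/12345.html") alike, unlike plain concatenation.
    return urljoin(catalogue_url, href)

def safe_decode(raw_bytes):
    # gb18030 decodes everything GBK can plus more; errors='replace'
    # substitutes undecodable bytes instead of aborting the download.
    return raw_bytes.decode('gb18030', errors='replace')
```

Swapping these into the loop is a one-line change each: `chapter_url = chapter_full_url(catalogue_url, dd.a["href"])` and `safe_decode(urlopen(chapter_url).read())`.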
