Beiwo Web (被窝网) TV Series Crawler
# Crawl TV series episodes
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import requests
import os
from tqdm import tqdm
def download_from_url(url, dst):
    """
    Download a file with resume support.
    :param url: URL to download the file from
    :param dst: local path to save the file to
    :return: True on success, False on failure
    """
    # Get the total file size from the server
    try:
        file_size = int(urlopen(url).info().get('Content-Length', -1))
    except Exception as e:
        print(e)
        print("Error: failed to access url %s" % url)
        return False
    # If the local file already exists, resume from its current size
    if os.path.exists(dst):
        first_byte = os.path.getsize(dst)
    else:
        first_byte = 0
    # If the local file is already complete, skip the download
    if first_byte >= file_size:
        print("File already exists, skipping download")
        return True
    # Request only the remaining bytes via an HTTP Range header
    header = {"Range": "bytes=%d-" % first_byte}
    pbar = tqdm(
        total=file_size, initial=first_byte,
        unit='B', unit_scale=True, desc=url.split('/')[-1])
    # Stream the response and append it to the local file
    req = requests.get(url, headers=header, stream=True)
    try:
        with open(dst, 'ab') as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
    except Exception as e:
        print(e)
        pbar.close()
        return False
    pbar.close()
    return True
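A minimal standalone usage sketch of download_from_url follows. The URL and destination path are hypothetical placeholders, not addresses from this site; rerunning the same call resumes from whatever bytes were already written to the destination file.

# Hypothetical example: download (or resume) a single file.
# The URL below is a placeholder for illustration only.
if download_from_url("http://example.com/sample.mp4", "./sample.mp4"):
    print("Download finished")
else:
    print("Download failed")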
# Site root URL
web_base_url = "http://10.1.48.113/"
# Episode list pages, one per TV series
vedio_episodes_page_url_list = [
    "http://10.1.48.113/shipin/dianshijuji/2018-09-29/193.php",
    "http://10.1.48.113/shipin/dianshijuji/2018-10-26/242.php",
    "http://10.1.48.113/shipin/dianshijuji/2018-10-26/239.php",
    "http://10.1.48.113/shipin/dianshijuji/2018-10-26/240.php",
    "http://10.1.48.113/shipin/dianshijuji/2018-10-26/238.php",
    "http://10.1.48.113/shipin/dianshijuji/2018-09-22/157.php"
]
for vedio_episodes_page_url in vedio_episodes_page_url_list:
    # Parse each TV series page
    try:
        vedio_episodes_page_html = urlopen(vedio_episodes_page_url).read().decode('utf-8')
        vedio_episodes_page_soup = BeautifulSoup(vedio_episodes_page_html, features='lxml')
        # Extract the series name from the page's meta tags and create a save directory
        vedio_name = vedio_episodes_page_soup.head.find_all("meta")[2]["content"].replace(" ", "")
        vedio_save_dir = "./" + vedio_name
        if not os.path.exists(vedio_save_dir):
            os.mkdir(vedio_save_dir)
        # Extract the per-episode playback page links
        vedio_episode_href_list = vedio_episodes_page_soup.find_all('a', {"class": "meihua_btn"})
        print("[Start downloading]: " + vedio_name + "---" + vedio_episodes_page_url)
        # Parse each episode
        count = 0
        for vedio_episode_href in vedio_episode_href_list:
            vedio_episode_url = web_base_url + vedio_episode_href["href"]
            vedio_episode_html = urlopen(vedio_episode_url).read().decode('utf-8')
            vedio_episode_soup = BeautifulSoup(vedio_episode_html, features='lxml')
            count = count + 1
            vedio_episode_title = "Episode " + str(count)
            vedio_episode_save_path = vedio_save_dir + "/" + vedio_episode_title + ".mp4"
            # The actual video address is embedded in the player script as video:'...'
            episode_url = web_base_url + re.findall("video:'(.*?)'", vedio_episode_html)[0]
            # Download each episode
            print("[Start downloading]: " + vedio_name + "---" + vedio_episode_title + "---" + episode_url)
            download_from_url(episode_url, vedio_episode_save_path)
            print("[Download finished]: " + vedio_name + "---" + vedio_episode_title)
    except Exception as e:
        print(e)
        print("Error: failed to parse url %s" % vedio_episodes_page_url)