Python 抓取豆瓣影视数据

jupiter
2022-04-17 / 0 评论 / 612 阅读 / 正在检测是否收录...
温馨提示:
本文最后更新于2022年04月17日,已超过712天没有更新,若内容或图片失效,请留言反馈。

Python 抓取豆瓣影视数据

1.代码

import json
import re

import requests
from bs4 import BeautifulSoup

# Douban subject ID of the title whose page is fetched.
douban_id = 6965622

# The request is sent with a desktop-browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
url = "https://movie.douban.com/subject/%s/" % douban_id

# A timeout keeps the script from hanging forever on a stalled connection.
html = requests.get(url=url, headers=headers, timeout=10)
# Stop on an HTTP error status instead of trying to parse an error page.
html.raise_for_status()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser" warning).
soup = BeautifulSoup(html.text, "html.parser")

# The structured metadata is taken from the first JSON-LD <script> element.
data = soup.find_all("script", {"type": "application/ld+json"})[0].string
# Line breaks are flattened to spaces before decoding.
data = data.replace("\n", " ")
# Decode as JSON rather than eval(): eval() would execute whatever text the
# remote page contains and cannot handle the JSON literals true/false/null.
# strict=False tolerates any remaining control characters inside strings.
data = json.loads(data, strict=False)

def _leading_names(people):
    """Return the first space-separated token of each entry's "name", each followed by ";"."""
    return "".join(person["name"].split(" ")[0] + ";" for person in people)


# Flatten the decoded JSON-LD record into a dict of plain strings.
# List-valued fields are serialised as "a;b;c;" (note the trailing ";").
data_handle = {
    "name": data["name"],
    "image": data["image"],
    "director": _leading_names(data["director"]),
    "actor": _leading_names(data["actor"]),
    "author": _leading_names(data["author"]),
    "genre": "".join(tag + ";" for tag in data["genre"]),
    "datePublished": data["datePublished"],
    # Year is the text before the first "-" of the publication date.
    "year": data["datePublished"].partition("-")[0],
    "avg_score": data["aggregateRating"]["ratingValue"],
    "description": data["description"],
}

# Optional fields read from the raw page HTML with a regex.
# Maps the output key to the label text that precedes the value on the page.
# The labels contain no regex metacharacters, so they are interpolated as-is.
_INFO_LABELS = {
    "sub": "又名",
    "lang": "语言",
    "total": "集数",
    "area": "制片国家/地区",
}

for key, label in _INFO_LABELS.items():
    # Search once per field and reuse the match, instead of scanning the whole
    # document twice (once to test, once to read the value).
    found = re.search(r'<span class="pl">%s:</span>(.*?)<br/>' % label, html.text)
    if found:
        # The captured text is stored verbatim (including any leading space);
        # a field whose label is absent from the page is simply left out.
        data_handle[key] = found.group(1)

print(data_handle)

2.执行结果

{
    'name': '悬崖',
    'image': '/usr/uploads/auto_save_image/35191c14b33cceb1e4d4d49bb49781c8.jpg',
    'director': '刘进;',
    'actor': '张嘉益;宋佳;程煜;李洪涛;咏梅;姬他;孙浩;徐程;林源;林龙麒;马丽;杨一威;封柏;刘宸希;涩谷天马;林千雯;张东升;孙鹏;施琅;钱漪;王兴君;宋家腾;张瀚文;',
    'author': '全勇先;',
    'genre': '剧情;历史;战争;悬疑;',
    'datePublished': '2012-01-01',
    'year': '2012',
    'avg_score': '8.5',
    'description': '上世纪30年代末,古老的中华大地正经受着最为苦难的时刻。外有日寇铁蹄进犯,内有不同派别势力的斗争碾压,战火连绵,生灵涂炭。为了获取重要的情报,共产党方面派出周乙(张嘉译 饰)和顾秋妍(宋佳 饰)假扮夫...',
    'sub': ' The Brink',
    'lang': ' 汉语普通话',
    'total': ' 40',
    'area': ' 中国大陆'
}
0

评论 (0)

打卡
取消