03 获取电影详情页信息
def get_message(url):
    """
    Fetch a movie detail page and yield one dict of the fields parsed from it.

    The page is requested after a fixed 10 second pause. Digits inside the
    ``stonefont`` spans are obfuscated by the site; ``get_numbers`` (defined
    elsewhere) returns two parallel sequences that are used here to map each
    obfuscated glyph string back to its digit string.

    Args:
        url: Address of the movie detail page.

    Yields:
        A single dict with the keys ``name``, ``type``, ``country``,
        ``length``, ``released``, ``score``, ``people`` and ``box_office``.
        ``score`` and ``people`` are only present when the corresponding
        ``stonefont`` spans exist; ``box_office`` is ``0`` when the page has
        no ``unit`` span (movies without box-office figures).

    Raises:
        IndexError: If the expected ``h3.name`` or ``li.ellipsis`` elements
            are missing, or the country/length line contains no ``/``.
        ValueError: If a decoded number cannot be converted with ``float``.
    """
    time.sleep(10)
    # NOTE(review): the header name sent here is "refer"; the standard HTTP
    # header is "Referer". Left untouched because str_to_dict and the server's
    # expectations are not visible from this block - confirm.
    host = """refer: http://maoyan.com/news
    """
    headers = str_to_dict(head + host)
    response = requests.get(url=url, headers=headers)
    page = response.text

    # Work around the site's digit obfuscation: glyph strings -> digit strings.
    # The name must match the one used in the decoding loop below; binding it
    # under a different name raised NameError.
    (maoyan_num_list, utf8last) = get_numbers(page)

    # Collect the page elements that carry the movie information.
    soup = BeautifulSoup(page, "html.parser")
    mw = soup.find_all('span', {'class': 'stonefont'})
    unit = soup.find_all('span', {'class': 'unit'})
    ell = soup.find_all('li', {'class': 'ellipsis'})
    name = soup.find_all('h3', {'class': 'name'})

    data = {}
    data["name"] = name[0].get_text()
    data["type"] = ell[0].get_text()
    # The second "ellipsis" item holds "<country> / <length>". Strip embedded
    # line breaks ('\n'), not the letter 'n'.
    country_and_length = ell[1].get_text().split('/')
    data["country"] = country_and_length[0].strip().replace('\n', '')
    data["length"] = country_and_length[1].strip().replace('\n', '')
    data["released"] = ell[2].get_text()[:10]

    # Some movies have no box-office figure, in which case there is no
    # "unit" span and every span after the first is treated as the people count.
    unit_text = unit[0].get_text() if unit else None
    for index, span in enumerate(mw):
        text = _decode_digits(span.get_text(), utf8last, maoyan_num_list)
        if index == 0:
            data["score"] = text + '分'
        elif index == 1 or not unit:
            data["people"] = _parse_count(text)
        elif index == 2:
            # The unit is either '万' (ten thousand) or, otherwise, taken to be
            # hundred-million.
            multiplier = 10000 if unit_text == '万' else 100000000
            data["box_office"] = int(float(text) * multiplier)
        # Any further spans have no known meaning and are ignored.
    if not unit:
        data["box_office"] = 0
    yield data


def _decode_digits(text, glyphs, digits):
    """Replace each obfuscated glyph string in *text* with its paired digit string."""
    for glyph, digit in zip(glyphs, digits):
        text = text.replace(glyph, digit)
    return text


def _parse_count(text):
    """Convert a count such as ``'12.3万'`` or ``'4567'`` to an int ('万' = x10000)."""
    if '万' in text:
        return int(float(text.replace('万', '')) * 10000)
    return int(float(text))
四、数据存储 (编辑:济南站长网)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!
|