guoguo12 写了一个非常方便的 billboard.com 解析器。不过,我想给它添加一个功能:除了该包已经提供的各种有用信息之外,还能返回歌词。但我是个新手,不太清楚该怎么做。
我认为 billboard 页面 HTML 中的下面这段代码就是指向歌词页面的链接:
<a class="chart-element__information__lyrics show-more__hidden-el" title="Read Lyrics of Blinding Lights by The Weeknd" href="https://www.billboard.com/articles/news/lyrics/8545919/the-weeknd-blinding-lights-lyrics" target="_blank">Song Lyrics</a>
下面是billboard.py的python代码
#!/usr/bin/env python
import datetime
import json
import re
import sys
from bs4 import BeautifulSoup
import requests
# NOTE(review): this string comes after the imports, so Python does not treat
# it as the module docstring; it would have to be the first statement in the
# file for that.
"""billboard.py: Unofficial Python API for accessing music charts from Billboard.com."""
# Package metadata.
__author__ = "Allen Guo"
__license__ = "MIT"
__maintainer__ = "Allen Guo"
__email__ = "guoguo12@gmail.com"
# css selector constants
# <meta> tag whose "content" attribute holds the human-readable chart title.
_CHART_NAME_SELECTOR = 'meta[name="title"]'
# Element whose text is the chart date (parsed with the "%B %d, %Y" format).
_DATE_ELEMENT_SELECTOR = "button.chart-detail-header__date-selector-button"
# Chevron icons; the parent element's href ends with the previous/next date.
_PREVIOUS_DATE_SELECTOR = "span.fa-chevron-left"
_NEXT_DATE_SELECTOR = "span.fa-chevron-right"
# One matching element per chart entry on old-style pages.
_ENTRY_LIST_SELECTOR = "div.chart-list-item"
# Attributes read directly off each entry element.
_ENTRY_TITLE_ATTR = "data-title"
_ENTRY_ARTIST_ATTR = "data-artist"
# Entry artwork; "data-src" is preferred over "src" when present.
_ENTRY_IMAGE_SELECTOR = "img.chart-list-item__image"
_ENTRY_RANK_ATTR = "data-rank"
# constants for the getMinistatsCellValue helper function
# Each ministats cell carries one statistic; its heading span names the
# statistic (compared lower-cased against e.g. "peak", "last", "weeks").
_MINISTATS_CELL = "div.chart-list-item__ministats-cell"
_MINISTATS_CELL_HEADING = "span.chart-list-item__ministats-cell-heading"
class BillboardNotFoundException(Exception):
    """Raised when Billboard.com answers a chart request with HTTP 404."""
class BillboardParseException(Exception):
    """Raised when an expected element of a chart page cannot be parsed."""
class ChartEntry:
    """One row of a chart, which is usually a single track.

    Attributes:
        title: Track title.
        artist: Artist name exactly as Billboard.com formats it; multiple
            and/or featured artists are all contained in this one string.
        image: URL of the artwork for the track.
        peakPos: Best position reached on the chart as of the chart date,
            as an int (None if the chart does not provide it).
        lastPos: Position on the previous week's chart, as an int (None if
            the chart does not provide it); 0 means the track was not on
            the previous week's chart.
        weeks: Number of weeks the track has been or was on the chart,
            including future dates (up until the present time).
        rank: Position on this chart, as an int.
        isNew: True when the track is new to the chart.
    """

    def __init__(self, title, artist, image, peakPos, lastPos, weeks, rank, isNew):
        self.title, self.artist, self.image = title, artist, image
        self.peakPos, self.lastPos, self.weeks = peakPos, lastPos, weeks
        self.rank, self.isNew = rank, isNew

    def __repr__(self):
        cls = self.__class__
        return "{}.{}(title={!r}, artist={!r})".format(
            cls.__module__, cls.__name__, self.title, self.artist
        )

    def __str__(self):
        """Render the entry as 'TITLE' by ARTIST (or just ARTIST if untitled)."""
        text = (
            u"'%s' by %s" % (self.title, self.artist)
            if self.title
            else u"%s" % self.artist
        )
        if sys.version_info.major >= 3:
            return text
        # Python 2: __str__ must return bytes, encoded for the console.
        return text.encode(getattr(sys.stdout, "encoding", "") or "utf8")

    def json(self):
        """Serialize the entry's attributes to a JSON string.

        Handy for caching.
        """

        def as_attributes(obj):
            return obj.__dict__

        return json.dumps(self, default=as_attributes, sort_keys=True, indent=4)
class ChartData:
    """Represents a particular Billboard chart for a particular date.

    Attributes:
        name: The chart name, as a string.
        title: The human-readable chart name, as a string.
        date: The date of the chart.
        previousDate: The date of the previous chart, as a string in YYYY-MM-DD
            format, or None if this information was not available.
        nextDate: The date of the next chart as read from the page, or None
            if the chart has not been fetched or the page did not provide it.
        entries: A list of ChartEntry objects, ordered by position on the chart
            (highest first).
    """

    def __init__(self, name, date=None, fetch=True, max_retries=5, timeout=25):
        """Constructs a new ChartData instance.

        Args:
            name: The chart name, e.g. 'hot-100' or 'pop-songs'.
            date: The chart date, as a string in YYYY-MM-DD format.
                By default, the latest chart is fetched.
                If the argument is not a date on which a chart was published,
                Billboard automatically rounds dates up to the nearest date on
                which a chart was published.
                If this argument is invalid, no exception will be raised;
                instead, the chart will contain no entries.
            fetch: A boolean indicating whether to fetch the chart data from
                Billboard.com immediately (at instantiation time).
                If False, the chart data can be populated at a later time
                using the fetchEntries() method.
            max_retries: The max number of times to retry when requesting data
                (default: 5).
            timeout: The number of seconds to wait for a server response.
                If None, no timeout is applied.

        Raises:
            ValueError: If date is given but is not a real calendar date in
                YYYY-MM-DD format.
        """
        self.name = name

        if date is not None:
            # Anchor the pattern at the end as well: a start-only match would
            # let "2020-01-01-05" through, which the datetime check below
            # would then accept as year/month/day/hour.
            if not re.match(r"\d{4}-\d{2}-\d{2}\Z", str(date)):
                raise ValueError("Date argument is not in YYYY-MM-DD format")
            try:
                datetime.datetime(*(int(x) for x in str(date).split("-")))
            except ValueError:
                raise ValueError("Date argument is invalid")

        self.date = date
        self.title = ""
        self.previousDate = None
        # Initialised here so the attribute exists even before a fetch or
        # when the page carries no next-chart link.
        self.nextDate = None

        self._max_retries = max_retries
        self._timeout = timeout

        self.entries = []
        if fetch:
            self.fetchEntries()

    def __repr__(self):
        return "{}.{}({!r}, date={!r})".format(
            self.__class__.__module__, self.__class__.__name__, self.name, self.date
        )

    def __str__(self):
        """Returns the chart as a human-readable string (typically multi-line)."""
        if not self.date:
            header = "%s chart (current)" % self.name
        else:
            header = "%s chart from %s" % (self.name, self.date)
        lines = [header, "-" * len(header)]
        lines.extend("%s. %s" % (entry.rank, str(entry)) for entry in self.entries)
        return "\n".join(lines)

    def __getitem__(self, key):
        """Returns the (key + 1)-th chart entry; i.e., chart[0] refers to the
        top entry on the chart.
        """
        return self.entries[key]

    def __len__(self):
        """Returns the number of entries in the chart.

        A length of zero may indicate a failed/bad request.
        """
        return len(self.entries)

    def json(self):
        """Returns the chart (including its entries) as a JSON string.

        This is useful for caching.
        """
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)

    def _parseOldStylePage(self, soup):
        """Populates date fields and entries from an old-style chart page.

        Raises:
            BillboardParseException: If a required field of an entry cannot
                be read from the page.
        """
        dateElement = soup.select_one(_DATE_ELEMENT_SELECTOR)
        if dateElement:
            dateText = dateElement.text.strip()
            curDate = datetime.datetime.strptime(dateText, "%B %d, %Y")
            self.date = curDate.strftime("%Y-%m-%d")

        # The chevron icons sit inside links whose last path segment is the
        # neighbouring chart's date.
        prevWeek = soup.select_one(_PREVIOUS_DATE_SELECTOR)
        nextWeek = soup.select_one(_NEXT_DATE_SELECTOR)
        if prevWeek and prevWeek.parent.get("href"):
            self.previousDate = prevWeek.parent.get("href").split("/")[-1]
        if nextWeek and nextWeek.parent.get("href"):
            self.nextDate = nextWeek.parent.get("href").split("/")[-1]

        for entrySoup in soup.select(_ENTRY_LIST_SELECTOR):
            try:
                title = entrySoup[_ENTRY_TITLE_ATTR].strip()
            except Exception:
                message = "Failed to parse title"
                raise BillboardParseException(message)

            try:
                artist = entrySoup[_ENTRY_ARTIST_ATTR].strip() or ""
            except Exception:
                message = "Failed to parse artist"
                raise BillboardParseException(message)

            # Entries without an artist carry the artist name in the title
            # slot, so swap the two.
            if artist == "":
                title, artist = artist, title

            try:
                imageSoup = entrySoup.select_one(_ENTRY_IMAGE_SELECTOR)
                if imageSoup.has_attr("data-src"):
                    image = imageSoup["data-src"]
                else:
                    image = imageSoup["src"]
            except Exception:
                message = "Failed to parse image"
                raise BillboardParseException(message)

            try:
                rank = int(entrySoup[_ENTRY_RANK_ATTR].strip())
            except Exception:
                message = "Failed to parse rank"
                raise BillboardParseException(message)

            if self.date:

                # "Ministats" is the name in the Billboard.com source code for
                # the stats under each chart entry
                def getMinistatsCellValue(fieldName, ifNoValue=None):
                    try:
                        for ministat in entrySoup.select(_MINISTATS_CELL):
                            heading = ministat.select_one(_MINISTATS_CELL_HEADING)
                            headingText = heading.string.strip().lower()
                            if headingText == fieldName:
                                # The number precedes a non-breaking space.
                                value = ministat.text.split(u"\xa0")[0].strip()
                                if value is None or value == "-":
                                    return ifNoValue
                                else:
                                    return int(value)
                        return ifNoValue
                    except Exception as e:
                        print(e)
                        message = "Failed to parse ministats cell value: %s" % fieldName
                        raise BillboardParseException(message)

                peakPos = getMinistatsCellValue("peak")
                lastPos = getMinistatsCellValue("last", ifNoValue=0)
                weeks = getMinistatsCellValue("weeks", ifNoValue=1)
                isNew = weeks == 1
            else:
                peakPos = lastPos = weeks = None
                isNew = False

            entry = ChartEntry(
                title, artist, image, peakPos, lastPos, weeks, rank, isNew
            )
            self.entries.append(entry)

    def _parseNewStylePage(self, soup):
        """Populates date fields and entries from a new-style chart page.

        Raises:
            BillboardParseException: If a required field of an entry cannot
                be read from the page.
        """
        dateElement = soup.select_one("button.date-selector__button.button--link")
        if dateElement:
            dateText = dateElement.text.strip()
            curDate = datetime.datetime.strptime(dateText, "%B %d, %Y")
            self.date = curDate.strftime("%Y-%m-%d")

        self.previousDate = soup.select_one("#charts")["data-previous-chart-date"]
        self.nextDate = soup.select_one("#charts")["data-chart-next-date"]

        for entrySoup in soup.select("li.chart-list__element"):

            def getEntryAttr(selector):
                return entrySoup.select_one(selector).text.strip()

            try:
                title = getEntryAttr("span.chart-element__information__song")
            except Exception:
                message = "Failed to parse title"
                raise BillboardParseException(message)

            try:
                artist = getEntryAttr("span.chart-element__information__artist") or ""
            except Exception:
                message = "Failed to parse artist"
                raise BillboardParseException(message)

            # Entries without an artist carry the artist name in the title
            # slot, so swap the two.
            if artist == "":
                title, artist = artist, title

            # TODO: Parse the image
            image = None

            try:
                rank = int(getEntryAttr("span.chart-element__rank__number"))
            except Exception:
                message = "Failed to parse rank"
                raise BillboardParseException(message)

            def getMeta(attribute, ifNoValue=None):
                try:
                    selected = entrySoup.select_one(
                        "span.chart-element__meta.text--%s" % attribute
                    )
                    if (
                        not selected
                        or selected.string is None
                        or selected.string == "-"
                    ):
                        return ifNoValue
                    else:
                        return int(selected.string.strip())
                except Exception:
                    message = "Failed to parse metadata value: %s" % attribute
                    raise BillboardParseException(message)

            if self.date:
                peakPos = getMeta("peak")
                lastPos = getMeta("last", ifNoValue=0)
                weeks = getMeta("week", ifNoValue=1)
                isNew = weeks == 1
            else:
                peakPos = lastPos = weeks = None
                isNew = False

            entry = ChartEntry(
                title, artist, image, peakPos, lastPos, weeks, rank, isNew
            )
            self.entries.append(entry)

    def _parsePage(self, soup):
        """Reads the chart title, then dispatches to the matching page parser."""
        chartTitleElement = soup.select_one(_CHART_NAME_SELECTOR)
        if chartTitleElement:
            # Keep the part before the first "|" and drop a trailing " Chart".
            self.title = re.sub(
                " Chart$",
                "",
                chartTitleElement.get("content", "").split("|")[0].strip(),
            )

        # Pages containing a <table> are handled as the old layout.
        if soup.select("table"):
            self._parseOldStylePage(soup)
        else:
            self._parseNewStylePage(soup)

    def fetchEntries(self):
        """GETs the corresponding chart data from Billboard.com, then parses
        the data using BeautifulSoup.

        Raises:
            BillboardNotFoundException: If the server responds with 404.
            requests.HTTPError: For any other unsuccessful status code.
        """
        if not self.date:
            # Fetch latest chart
            url = "https://www.billboard.com/charts/%s" % (self.name)
        else:
            url = "https://www.billboard.com/charts/%s/%s" % (self.name, self.date)

        session = _get_session_with_retries(max_retries=self._max_retries)
        req = session.get(url, timeout=self._timeout)
        if req.status_code == 404:
            message = "Chart not found (perhaps the name is misspelled?)"
            raise BillboardNotFoundException(message)
        req.raise_for_status()

        soup = BeautifulSoup(req.text, "html.parser")
        self._parsePage(soup)
def charts():
    """Gets a list of all Billboard charts from Billboard.com.

    Returns:
        A list of strings: the last path segment of the href of every
        <a class="chart-panel__link"> element on the charts index page.

    Raises:
        requests.HTTPError: If the index page request is unsuccessful.
    """
    session = _get_session_with_retries(max_retries=5)
    req = session.get("https://www.billboard.com/charts", timeout=25)
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    # find_all is the current name; findAll is a deprecated alias for it.
    return [
        link["href"].split("/")[-1]
        for link in soup.find_all("a", {"class": "chart-panel__link"})
    ]
def _get_session_with_retries(max_retries):
    """Build a requests session whose Billboard.com requests are retried.

    Args:
        max_retries: Passed straight to the HTTPAdapter mounted for
            "https://www.billboard.com".

    Returns:
        The configured requests.Session.
    """
    retrying_adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
    billboard_session = requests.Session()
    billboard_session.mount("https://www.billboard.com", retrying_adapter)
    return billboard_session
guoguo12告诉我我需要这样做
“哦,明白了。你需要为a元素写一个选择器,抓取URL,下载那个页面,然后解析它。”
然而,我不确定上面的话是什么意思(我有提到我是菜鸟吗?)。
我的问题是,我应该在哪里添加选择器标记?我想我可以添加一些东西,比如
_chart-element__information__lyrics show-more__hidden-el" title="
放在下面这行注释的后面:
# css selector constants
但这看起来不太对。
发布于 2020-05-26 11:58:36
我不确定这是否直接回答了上面的问题,但它确实解决了我想要解决的问题。正如 Shinyhero36 在其回答中提到的,您可以使用以下代码获取歌词(注意:歌词是从 Musixmatch 抓取的,而不是从 Billboard):
import requests
from bs4 import BeautifulSoup as Parse
def make_soup(url, timeout=25):
    """Download a web page and parse it as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled server would block the call
            indefinitely; pass None to restore that behaviour.

    Returns:
        The parsed document (a BeautifulSoup object built with the
        "html.parser" backend).
    """
    # A browser-like User-Agent is sent with the request.
    user_agent = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    r = requests.get(url, headers=user_agent, timeout=timeout)
    html = Parse(r.content, "html.parser")
    return html
def format_url(string):
    """Return *string* with every space replaced by '%20'."""
    pieces = string.split(" ")
    return "%20".join(pieces)
def get_song_url(html):
    """Return the href of the first <a class="title"> element in *html*."""
    title_link = html.find("a", {"class": "title"})
    return title_link["href"]
def find_Lyrics(titre, artiste):
    """Search Musixmatch for a song, print its lyrics and return them.

    Args:
        titre: Song title.
        artiste: Artist name.

    Returns:
        A list with the text of every <span class="lyrics__content__ok">
        on the song page, in page order. The list is empty when the search
        page contains no track link.
    """
    from urllib.parse import quote

    # Percent-encode the whole query so that characters such as "/", "?" or
    # "&" in a title or artist name cannot change the URL's structure.
    # Spaces still become "%20".
    query = quote(f"{artiste} {titre}", safe="")
    url = f"https://www.musixmatch.com/fr/search/{query}/tracks"
    pageweb = make_soup(url)

    # Get the link to the song page; bail out cleanly when nothing matched
    # instead of failing on None["href"].
    link = pageweb.find("a", {"class": "title"})
    if link is None:
        return []
    song_url = "https://www.musixmatch.com" + link["href"]

    # Get the lyrics.
    pageweb = make_soup(song_url)
    paroles = [
        span.text
        for span in pageweb.find_all("span", {"class": "lyrics__content__ok"})
    ]
    for bloc in paroles:
        print(bloc)
    return paroles
# Example call with placeholder arguments: the first is the song title, the
# second the artist. This runs at import time because it is at module level.
find_Lyrics("title","artist")
将标题和艺术家替换为要搜索的歌曲的标题和艺术家。
https://stackoverflow.com/questions/61991323
复制相似问题