[ie/youtube] Add a PO Token Provider Framework (#12840)

https://github.com/yt-dlp/yt-dlp/tree/master/yt_dlp/extractor/youtube/pot/README.md

Authored by: coletdjnz
This commit is contained in:
coletdjnz
2025-05-18 13:45:26 +12:00
committed by GitHub
parent abf58dcd6a
commit 2685654a37
28 changed files with 4134 additions and 28 deletions

View File

@@ -23,6 +23,8 @@ from ._base import (
_split_innertube_client,
short_client_name,
)
from .pot._director import initialize_pot_director
from .pot.provider import PoTokenContext, PoTokenRequest
from ..openload import PhantomJSwrapper
from ...jsinterp import JSInterpreter
from ...networking.exceptions import HTTPError
@@ -66,6 +68,7 @@ from ...utils import (
urljoin,
variadic,
)
from ...utils.networking import clean_headers, clean_proxies, select_proxy
STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client'
STREAMING_DATA_INITIAL_PO_TOKEN = '__yt_dlp_po_token'
@@ -1809,6 +1812,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
super().__init__(*args, **kwargs)
self._code_cache = {}
self._player_cache = {}
self._pot_director = None
def _real_initialize(self):
super()._real_initialize()
self._pot_director = initialize_pot_director(self)
def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data, is_live):
lock = threading.Lock()
@@ -2855,7 +2863,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
continue
def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, visitor_data=None,
data_sync_id=None, session_index=None, player_url=None, video_id=None, **kwargs):
data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, **kwargs):
"""
Fetch a PO Token for a given client and context. This function will validate required parameters for a given context and client.
@@ -2869,10 +2877,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
@param session_index: session index.
@param player_url: player URL.
@param video_id: video ID.
@param webpage: video webpage.
@param kwargs: Additional arguments to pass down. May be more added in the future.
@return: The fetched PO Token. None if it could not be fetched.
"""
# TODO(future): This validation should be moved into pot framework.
# Some sort of middleware or validation provider perhaps?
# GVS WebPO Token is bound to visitor_data / Visitor ID when logged out.
# Must have visitor_data for it to function.
if player_url and context == _PoTokenContext.GVS and not visitor_data and not self.is_authenticated:
@@ -2894,6 +2906,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f'Got a GVS PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.'
f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"')
self.write_debug(f'{video_id}: Retrieved a {context.value} PO Token for {client} client from config')
return config_po_token
# Require GVS WebPO Token if logged in for external fetching
@@ -2903,7 +2916,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"')
return
return self._fetch_po_token(
po_token = self._fetch_po_token(
client=client,
context=context.value,
ytcfg=ytcfg,
@@ -2912,11 +2925,66 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
session_index=session_index,
player_url=player_url,
video_id=video_id,
video_webpage=webpage,
**kwargs,
)
if po_token:
self.write_debug(f'{video_id}: Retrieved a {context.value} PO Token for {client} client')
return po_token
def _fetch_po_token(self, client, **kwargs):
"""(Unstable) External PO Token fetch stub"""
context = kwargs.get('context')
# Avoid fetching PO Tokens when not required
fetch_pot_policy = self._configuration_arg('fetch_pot', [''], ie_key=YoutubeIE)[0]
if fetch_pot_policy not in ('never', 'auto', 'always'):
fetch_pot_policy = 'auto'
if (
fetch_pot_policy == 'never'
or (
fetch_pot_policy == 'auto'
and _PoTokenContext(context) not in self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS']
)
):
return None
headers = self.get_param('http_headers').copy()
proxies = self._downloader.proxies.copy()
clean_headers(headers)
clean_proxies(proxies, headers)
innertube_host = self._select_api_hostname(None, default_client=client)
pot_request = PoTokenRequest(
context=PoTokenContext(context),
innertube_context=traverse_obj(kwargs, ('ytcfg', 'INNERTUBE_CONTEXT')),
innertube_host=innertube_host,
internal_client_name=client,
session_index=kwargs.get('session_index'),
player_url=kwargs.get('player_url'),
video_webpage=kwargs.get('video_webpage'),
is_authenticated=self.is_authenticated,
visitor_data=kwargs.get('visitor_data'),
data_sync_id=kwargs.get('data_sync_id'),
video_id=kwargs.get('video_id'),
request_cookiejar=self._downloader.cookiejar,
# All requests that would need to be proxied should be in the
# context of www.youtube.com or the innertube host
request_proxy=(
select_proxy('https://www.youtube.com', proxies)
or select_proxy(f'https://{innertube_host}', proxies)
),
request_headers=headers,
request_timeout=self.get_param('socket_timeout'),
request_verify_tls=not self.get_param('nocheckcertificate'),
request_source_address=self.get_param('source_address'),
bypass_cache=False,
)
return self._pot_director.get_po_token(pot_request)
@staticmethod
def _is_agegated(player_response):
@@ -3074,8 +3142,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'video_id': video_id,
'data_sync_id': data_sync_id if self.is_authenticated else None,
'player_url': player_url if require_js_player else None,
'webpage': webpage,
'session_index': self._extract_session_index(master_ytcfg, player_ytcfg),
'ytcfg': player_ytcfg,
'ytcfg': player_ytcfg or self._get_default_ytcfg(client),
}
player_po_token = self.fetch_po_token(

View File

@@ -0,0 +1,309 @@
# YoutubeIE PO Token Provider Framework
As part of the YouTube extractor, we have a framework for providing PO Tokens programmatically. This can be used by plugins.
Refer to the [PO Token Guide](https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide) for more information on PO Tokens.
> [!TIP]
> If publishing a PO Token Provider plugin to GitHub, add the [yt-dlp-pot-provider](https://github.com/topics/yt-dlp-pot-provider) topic to your repository to help users find it.
## Public APIs
- `yt_dlp.extractor.youtube.pot.cache`
- `yt_dlp.extractor.youtube.pot.provider`
- `yt_dlp.extractor.youtube.pot.utils`
Everything else is internal-only and no guarantees are made about the API stability.
> [!WARNING]
> We will try our best to maintain stability with the public APIs.
> However, due to the nature of extractors and YouTube, we may need to remove or change APIs in the future.
> If you are using these APIs outside yt-dlp plugins, please account for this by importing them safely.
## PO Token Provider
`yt_dlp.extractor.youtube.pot.provider`
```python
from yt_dlp.extractor.youtube.pot.provider import (
PoTokenRequest,
PoTokenContext,
PoTokenProvider,
PoTokenResponse,
PoTokenProviderError,
PoTokenProviderRejectedRequest,
register_provider,
register_preference,
ExternalRequestFeature,
)
from yt_dlp.networking.common import Request
from yt_dlp.extractor.youtube.pot.utils import get_webpo_content_binding
from yt_dlp.utils import traverse_obj
from yt_dlp.networking.exceptions import RequestError
import json
@register_provider
class MyPoTokenProviderPTP(PoTokenProvider): # Provider class name must end with "PTP"
PROVIDER_VERSION = '0.2.1'
# Define a unique display name for the provider
PROVIDER_NAME = 'my-provider'
BUG_REPORT_LOCATION = 'https://issues.example.com/report'
# -- Validation shortcuts. Set these to None to disable. --
# Innertube Client Name.
# For example, "WEB", "ANDROID", "TVHTML5".
# For a list of WebPO client names,
# see yt_dlp.extractor.youtube.pot.utils.WEBPO_CLIENTS.
# Also see yt_dlp.extractor.youtube._base.INNERTUBE_CLIENTS
# for a list of client names currently supported by the YouTube extractor.
_SUPPORTED_CLIENTS = ('WEB', 'TVHTML5')
_SUPPORTED_CONTEXTS = (
PoTokenContext.GVS,
)
# If your provider makes external requests to websites (i.e. to youtube.com)
# using another library or service (i.e., not _request_webpage),
# set the request features that are supported here.
# If only using _request_webpage to make external requests, set this to None.
_SUPPORTED_EXTERNAL_REQUEST_FEATURES = (
ExternalRequestFeature.PROXY_SCHEME_HTTP,
ExternalRequestFeature.SOURCE_ADDRESS,
ExternalRequestFeature.DISABLE_TLS_VERIFICATION
)
def is_available(self) -> bool:
"""
Check if the provider is available (e.g. all required dependencies are available)
This is used to determine if the provider should be used and to provide debug information.
IMPORTANT: This method SHOULD NOT make any network requests or perform any expensive operations.
Since this is called multiple times, we recommend caching the result.
"""
return True
def close(self):
# Optional close hook, called when YoutubeDL is closed.
pass
def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse:
# If you need to validate the request before making the request to the external source.
# Raise yt_dlp.extractor.youtube.pot.provider.PoTokenProviderRejectedRequest if the request is not supported.
if request.is_authenticated:
raise PoTokenProviderRejectedRequest(
'This provider does not support authenticated requests'
)
# Settings are pulled from extractor args passed to yt-dlp with the key `youtubepot-<PROVIDER_KEY>`.
# For this example, the extractor arg would be:
# `--extractor-args "youtubepot-mypotokenprovider:url=https://custom.example.com/get_pot"`
external_provider_url = self._configuration_arg(
'url', default=['https://provider.example.com/get_pot'])[0]
# See below for logging guidelines
self.logger.trace(f'Using external provider URL: {external_provider_url}')
# You should use the internal HTTP client to make requests where possible,
# as it will handle cookies and other networking settings passed to yt-dlp.
try:
# See docstring in _request_webpage method for request tips
response = self._request_webpage(
Request(external_provider_url, data=json.dumps({
'content_binding': get_webpo_content_binding(request),
'proxy': request.request_proxy,
'headers': request.request_headers,
'source_address': request.request_source_address,
'verify_tls': request.request_verify_tls,
# Important: If your provider has its own caching, please respect `bypass_cache`.
# This may be used in the future to request a fresh PO Token if required.
'do_not_cache': request.bypass_cache,
}).encode(), proxies={'all': None}),
pot_request=request,
note=(
f'Requesting {request.context.value} PO Token '
f'for {request.internal_client_name} client from external provider'),
)
except RequestError as e:
# If there is an error, raise PoTokenProviderError.
# You can specify whether it is expected or not. If it is unexpected,
# the log will include a link to the bug report location (BUG_REPORT_LOCATION).
raise PoTokenProviderError(
'Networking error while fetching to get PO Token from external provider',
expected=True
) from e
# Note: PO Token is expected to be base64url encoded
po_token = traverse_obj(response, 'po_token')
if not po_token:
raise PoTokenProviderError(
'Bad PO Token Response from external provider',
expected=False
)
return PoTokenResponse(
po_token=po_token,
# Optional, add a custom expiration timestamp for the token. Use for caching.
# By default, yt-dlp will use the default ttl from a registered cache spec (see below)
# Set to 0 or -1 to not cache this response.
expires_at=None,
)
# If there are multiple PO Token Providers that can handle the same PoTokenRequest,
# you can define a preference function to increase/decrease the priority of providers.
@register_preference(MyPoTokenProviderPTP)
def my_provider_preference(provider: PoTokenProvider, request: PoTokenRequest) -> int:
return 50
```
## Logging Guidelines
- Use the `self.logger` object to log messages.
- When making HTTP requests or any other expensive operation, use `self.logger.info` to log a message to standard non-verbose output.
- This lets users know what is happening when a time-expensive operation is taking place.
- It is recommended to include the PO Token context and internal client name in the message if possible.
- For example, `self.logger.info(f'Requesting {request.context.value} PO Token for {request.internal_client_name} client from external provider')`.
- Use `self.logger.debug` to log a message to the verbose output (`--verbose`).
- For debugging information visible to users posting verbose logs.
- Try to not log too much, prefer using trace logging for detailed debug messages.
- Use `self.logger.trace` to log a message to the PO Token debug output (`--extractor-args "youtube:pot_trace=true"`).
- Log as much as you like here as needed for debugging your provider.
- Avoid logging PO Tokens or any sensitive information to debug or info output.
## Debugging
- Use `-v --extractor-args "youtube:pot_trace=true"` to enable PO Token debug output.
## Caching
> [!WARNING]
> The following describes more advance features that most users/developers will not need to use.
> [!IMPORTANT]
> yt-dlp currently has a built-in LRU Memory Cache Provider and a cache spec provider for WebPO Tokens.
> You should only need to implement cache providers if you want an external cache, or a cache spec if you are handling non-WebPO Tokens.
### Cache Providers
`yt_dlp.extractor.youtube.pot.cache`
```python
from yt_dlp.extractor.youtube.pot.cache import (
PoTokenCacheProvider,
register_preference,
register_provider
)
from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest
@register_provider
class MyCacheProviderPCP(PoTokenCacheProvider): # Provider class name must end with "PCP"
PROVIDER_VERSION = '0.1.0'
# Define a unique display name for the provider
PROVIDER_NAME = 'my-cache-provider'
BUG_REPORT_LOCATION = 'https://issues.example.com/report'
def is_available(self) -> bool:
"""
Check if the provider is available (e.g. all required dependencies are available)
This is used to determine if the provider should be used and to provide debug information.
IMPORTANT: This method SHOULD NOT make any network requests or perform any expensive operations.
Since this is called multiple times, we recommend caching the result.
"""
return True
def get(self, key: str):
# Similar to PO Token Providers, Cache Providers and Cache Spec Providers
# are passed down extractor args matching key youtubepot-<PROVIDER_KEY>.
some_setting = self._configuration_arg('some_setting', default=['default_value'])[0]
return self.my_cache.get(key)
def store(self, key: str, value: str, expires_at: int):
# ⚠ expires_at MUST be respected.
# Cache entries should not be returned if they have expired.
self.my_cache.store(key, value, expires_at)
def delete(self, key: str):
self.my_cache.delete(key)
def close(self):
# Optional close hook, called when the YoutubeDL instance is closed.
pass
# If there are multiple PO Token Cache Providers available, you can
# define a preference function to increase/decrease the priority of providers.
# IMPORTANT: Providers should be in preference of cache lookup time.
# For example, a memory cache should have a higher preference than a disk cache.
# VERY IMPORTANT: yt-dlp has a built-in memory cache with a priority of 10000.
# Your cache provider should be lower than this.
@register_preference(MyCacheProviderPCP)
def my_cache_preference(provider: PoTokenCacheProvider, request: PoTokenRequest) -> int:
return 50
```
### Cache Specs
`yt_dlp.extractor.youtube.pot.cache`
These are used to provide information on how to cache a particular PO Token Request.
You might have a different cache spec for different kinds of PO Tokens.
```python
from yt_dlp.extractor.youtube.pot.cache import (
PoTokenCacheSpec,
PoTokenCacheSpecProvider,
CacheProviderWritePolicy,
register_spec,
)
from yt_dlp.utils import traverse_obj
from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest
@register_spec
class MyCacheSpecProviderPCSP(PoTokenCacheSpecProvider): # Provider class name must end with "PCSP"
PROVIDER_VERSION = '0.1.0'
# Define a unique display name for the provider
PROVIDER_NAME = 'mycachespec'
BUG_REPORT_LOCATION = 'https://issues.example.com/report'
def generate_cache_spec(self, request: PoTokenRequest):
client_name = traverse_obj(request.innertube_context, ('client', 'clientName'))
if client_name != 'ANDROID':
# If the request is not supported by the cache spec, return None
return None
# Generate a cache spec for the request
return PoTokenCacheSpec(
# Key bindings to uniquely identify the request. These are used to generate a cache key.
key_bindings={
'client_name': client_name,
'content_binding': 'unique_content_binding',
'ip': traverse_obj(request.innertube_context, ('client', 'remoteHost')),
'source_address': request.request_source_address,
'proxy': request.request_proxy,
},
# Default Cache TTL in seconds
default_ttl=21600,
# Optional: Specify a write policy.
# WRITE_FIRST will write to the highest priority provider only,
# whereas WRITE_ALL will write to all providers.
# WRITE_FIRST may be useful if the PO Token is short-lived
# and there is no use writing to all providers.
write_policy=CacheProviderWritePolicy.WRITE_ALL,
)
```

View File

@@ -0,0 +1,3 @@
# Trigger import of built-in providers
from ._builtin.memory_cache import MemoryLRUPCP as _MemoryLRUPCP # noqa: F401
from ._builtin.webpo_cachespec import WebPoPCSP as _WebPoPCSP # noqa: F401

View File

@@ -0,0 +1,78 @@
from __future__ import annotations
import datetime as dt
import typing
from threading import Lock
from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider
from yt_dlp.extractor.youtube.pot._registry import _pot_memory_cache
from yt_dlp.extractor.youtube.pot.cache import (
PoTokenCacheProvider,
register_preference,
register_provider,
)
def initialize_global_cache(max_size: int):
if _pot_memory_cache.value.get('cache') is None:
_pot_memory_cache.value['cache'] = {}
_pot_memory_cache.value['lock'] = Lock()
_pot_memory_cache.value['max_size'] = max_size
if _pot_memory_cache.value['max_size'] != max_size:
raise ValueError('Cannot change max_size of initialized global memory cache')
return (
_pot_memory_cache.value['cache'],
_pot_memory_cache.value['lock'],
_pot_memory_cache.value['max_size'],
)
@register_provider
class MemoryLRUPCP(PoTokenCacheProvider, BuiltinIEContentProvider):
PROVIDER_NAME = 'memory'
DEFAULT_CACHE_SIZE = 25
def __init__(
self,
*args,
initialize_cache: typing.Callable[[int], tuple[dict[str, tuple[str, int]], Lock, int]] = initialize_global_cache,
**kwargs,
):
super().__init__(*args, **kwargs)
self.cache, self.lock, self.max_size = initialize_cache(self.DEFAULT_CACHE_SIZE)
def is_available(self) -> bool:
return True
def get(self, key: str) -> str | None:
with self.lock:
if key not in self.cache:
return None
value, expires_at = self.cache.pop(key)
if expires_at < int(dt.datetime.now(dt.timezone.utc).timestamp()):
return None
self.cache[key] = (value, expires_at)
return value
def store(self, key: str, value: str, expires_at: int):
with self.lock:
if expires_at < int(dt.datetime.now(dt.timezone.utc).timestamp()):
return
if key in self.cache:
self.cache.pop(key)
self.cache[key] = (value, expires_at)
if len(self.cache) > self.max_size:
oldest_key = next(iter(self.cache))
self.cache.pop(oldest_key)
def delete(self, key: str):
with self.lock:
self.cache.pop(key, None)
@register_preference(MemoryLRUPCP)
def memorylru_preference(*_, **__):
# Memory LRU Cache SHOULD be the highest priority
return 10000

View File

@@ -0,0 +1,48 @@
from __future__ import annotations
from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider
from yt_dlp.extractor.youtube.pot.cache import (
CacheProviderWritePolicy,
PoTokenCacheSpec,
PoTokenCacheSpecProvider,
register_spec,
)
from yt_dlp.extractor.youtube.pot.provider import (
PoTokenRequest,
)
from yt_dlp.extractor.youtube.pot.utils import ContentBindingType, get_webpo_content_binding
from yt_dlp.utils import traverse_obj
@register_spec
class WebPoPCSP(PoTokenCacheSpecProvider, BuiltinIEContentProvider):
PROVIDER_NAME = 'webpo'
def generate_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None:
bind_to_visitor_id = self._configuration_arg(
'bind_to_visitor_id', default=['true'])[0] == 'true'
content_binding, content_binding_type = get_webpo_content_binding(
request, bind_to_visitor_id=bind_to_visitor_id)
if not content_binding or not content_binding_type:
return None
write_policy = CacheProviderWritePolicy.WRITE_ALL
if content_binding_type == ContentBindingType.VIDEO_ID:
write_policy = CacheProviderWritePolicy.WRITE_FIRST
return PoTokenCacheSpec(
key_bindings={
't': 'webpo',
'cb': content_binding,
'cbt': content_binding_type.value,
'ip': traverse_obj(request.innertube_context, ('client', 'remoteHost')),
'sa': request.request_source_address,
'px': request.request_proxy,
},
# Integrity token response usually states it has a ttl of 12 hours (43200 seconds).
# We will default to 6 hours to be safe.
default_ttl=21600,
write_policy=write_policy,
)

View File

@@ -0,0 +1,468 @@
from __future__ import annotations
import base64
import binascii
import dataclasses
import datetime as dt
import hashlib
import json
import typing
import urllib.parse
from collections.abc import Iterable
from yt_dlp.extractor.youtube.pot._provider import (
BuiltinIEContentProvider,
IEContentProvider,
IEContentProviderLogger,
)
from yt_dlp.extractor.youtube.pot._registry import (
_pot_cache_provider_preferences,
_pot_cache_providers,
_pot_pcs_providers,
_pot_providers,
_ptp_preferences,
)
from yt_dlp.extractor.youtube.pot.cache import (
CacheProviderWritePolicy,
PoTokenCacheProvider,
PoTokenCacheProviderError,
PoTokenCacheSpec,
PoTokenCacheSpecProvider,
)
from yt_dlp.extractor.youtube.pot.provider import (
PoTokenProvider,
PoTokenProviderError,
PoTokenProviderRejectedRequest,
PoTokenRequest,
PoTokenResponse,
provider_bug_report_message,
)
from yt_dlp.utils import bug_reports_message, format_field, join_nonempty
if typing.TYPE_CHECKING:
from yt_dlp.extractor.youtube.pot.cache import CacheProviderPreference
from yt_dlp.extractor.youtube.pot.provider import Preference
class YoutubeIEContentProviderLogger(IEContentProviderLogger):
def __init__(self, ie, prefix, log_level: IEContentProviderLogger.LogLevel | None = None):
self.__ie = ie
self.prefix = prefix
self.log_level = log_level if log_level is not None else self.LogLevel.INFO
def _format_msg(self, message: str):
prefixstr = format_field(self.prefix, None, '[%s] ')
return f'{prefixstr}{message}'
def trace(self, message: str):
if self.log_level <= self.LogLevel.TRACE:
self.__ie.write_debug(self._format_msg('TRACE: ' + message))
def debug(self, message: str):
if self.log_level <= self.LogLevel.DEBUG:
self.__ie.write_debug(self._format_msg(message))
def info(self, message: str):
if self.log_level <= self.LogLevel.INFO:
self.__ie.to_screen(self._format_msg(message))
def warning(self, message: str, *, once=False):
if self.log_level <= self.LogLevel.WARNING:
self.__ie.report_warning(self._format_msg(message), only_once=once)
def error(self, message: str):
if self.log_level <= self.LogLevel.ERROR:
self.__ie._downloader.report_error(self._format_msg(message), is_error=False)
class PoTokenCache:
def __init__(
self,
logger: IEContentProviderLogger,
cache_providers: list[PoTokenCacheProvider],
cache_spec_providers: list[PoTokenCacheSpecProvider],
cache_provider_preferences: list[CacheProviderPreference] | None = None,
):
self.cache_providers: dict[str, PoTokenCacheProvider] = {
provider.PROVIDER_KEY: provider for provider in (cache_providers or [])}
self.cache_provider_preferences: list[CacheProviderPreference] = cache_provider_preferences or []
self.cache_spec_providers: dict[str, PoTokenCacheSpecProvider] = {
provider.PROVIDER_KEY: provider for provider in (cache_spec_providers or [])}
self.logger = logger
def _get_cache_providers(self, request: PoTokenRequest) -> Iterable[PoTokenCacheProvider]:
"""Sorts available cache providers by preference, given a request"""
preferences = {
provider: sum(pref(provider, request) for pref in self.cache_provider_preferences)
for provider in self.cache_providers.values()
}
if self.logger.log_level <= self.logger.LogLevel.TRACE:
# calling is_available() for every PO Token provider upfront may have some overhead
self.logger.trace(f'PO Token Cache Providers: {provider_display_list(self.cache_providers.values())}')
self.logger.trace('Cache Provider preferences for this request: {}'.format(', '.join(
f'{provider.PROVIDER_KEY}={pref}' for provider, pref in preferences.items())))
return (
provider for provider in sorted(
self.cache_providers.values(), key=preferences.get, reverse=True) if provider.is_available())
def _get_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None:
for provider in self.cache_spec_providers.values():
if not provider.is_available():
continue
try:
spec = provider.generate_cache_spec(request)
if not spec:
continue
if not validate_cache_spec(spec):
self.logger.error(
f'PoTokenCacheSpecProvider "{provider.PROVIDER_KEY}" generate_cache_spec() '
f'returned invalid spec {spec}{provider_bug_report_message(provider)}')
continue
spec = dataclasses.replace(spec, _provider=provider)
self.logger.trace(
f'Retrieved cache spec {spec} from cache spec provider "{provider.PROVIDER_NAME}"')
return spec
except Exception as e:
self.logger.error(
f'Error occurred with "{provider.PROVIDER_NAME}" PO Token cache spec provider: '
f'{e!r}{provider_bug_report_message(provider)}')
continue
return None
def _generate_key_bindings(self, spec: PoTokenCacheSpec) -> dict[str, str]:
bindings_cleaned = {
**{k: v for k, v in spec.key_bindings.items() if v is not None},
# Allow us to invalidate caches if such need arises
'_dlp_cache': 'v1',
}
if spec._provider:
bindings_cleaned['_p'] = spec._provider.PROVIDER_KEY
self.logger.trace(f'Generated cache key bindings: {bindings_cleaned}')
return bindings_cleaned
def _generate_key(self, bindings: dict) -> str:
binding_string = ''.join(repr(dict(sorted(bindings.items()))))
return hashlib.sha256(binding_string.encode()).hexdigest()
def get(self, request: PoTokenRequest) -> PoTokenResponse | None:
spec = self._get_cache_spec(request)
if not spec:
self.logger.trace('No cache spec available for this request, unable to fetch from cache')
return None
cache_key = self._generate_key(self._generate_key_bindings(spec))
self.logger.trace(f'Attempting to access PO Token cache using key: {cache_key}')
for idx, provider in enumerate(self._get_cache_providers(request)):
try:
self.logger.trace(
f'Attempting to fetch PO Token response from "{provider.PROVIDER_NAME}" cache provider')
cache_response = provider.get(cache_key)
if not cache_response:
continue
try:
po_token_response = PoTokenResponse(**json.loads(cache_response))
except (TypeError, ValueError, json.JSONDecodeError):
po_token_response = None
if not validate_response(po_token_response):
self.logger.error(
f'Invalid PO Token response retrieved from cache provider "{provider.PROVIDER_NAME}": '
f'{cache_response}{provider_bug_report_message(provider)}')
provider.delete(cache_key)
continue
self.logger.trace(
f'PO Token response retrieved from cache using "{provider.PROVIDER_NAME}" provider: '
f'{po_token_response}')
if idx > 0:
# Write back to the highest priority cache provider,
# so we stop trying to fetch from lower priority providers
self.logger.trace('Writing PO Token response to highest priority cache provider')
self.store(request, po_token_response, write_policy=CacheProviderWritePolicy.WRITE_FIRST)
return po_token_response
except PoTokenCacheProviderError as e:
self.logger.warning(
f'Error from "{provider.PROVIDER_NAME}" PO Token cache provider: '
f'{e!r}{provider_bug_report_message(provider) if not e.expected else ""}')
continue
except Exception as e:
self.logger.error(
f'Error occurred with "{provider.PROVIDER_NAME}" PO Token cache provider: '
f'{e!r}{provider_bug_report_message(provider)}',
)
continue
return None
def store(
self,
request: PoTokenRequest,
response: PoTokenResponse,
write_policy: CacheProviderWritePolicy | None = None,
):
spec = self._get_cache_spec(request)
if not spec:
self.logger.trace('No cache spec available for this request. Not caching.')
return
if not validate_response(response):
self.logger.error(
f'Invalid PO Token response provided to PoTokenCache.store(): '
f'{response}{bug_reports_message()}')
return
cache_key = self._generate_key(self._generate_key_bindings(spec))
self.logger.trace(f'Attempting to access PO Token cache using key: {cache_key}')
default_expires_at = int(dt.datetime.now(dt.timezone.utc).timestamp()) + spec.default_ttl
cache_response = dataclasses.replace(response, expires_at=response.expires_at or default_expires_at)
write_policy = write_policy or spec.write_policy
self.logger.trace(f'Using write policy: {write_policy}')
for idx, provider in enumerate(self._get_cache_providers(request)):
try:
self.logger.trace(
f'Caching PO Token response in "{provider.PROVIDER_NAME}" cache provider '
f'(key={cache_key}, expires_at={cache_response.expires_at})')
provider.store(
key=cache_key,
value=json.dumps(dataclasses.asdict(cache_response)),
expires_at=cache_response.expires_at)
except PoTokenCacheProviderError as e:
self.logger.warning(
f'Error from "{provider.PROVIDER_NAME}" PO Token cache provider: '
f'{e!r}{provider_bug_report_message(provider) if not e.expected else ""}')
except Exception as e:
self.logger.error(
f'Error occurred with "{provider.PROVIDER_NAME}" PO Token cache provider: '
f'{e!r}{provider_bug_report_message(provider)}')
# WRITE_FIRST should not write to lower priority providers in the case the highest priority provider fails
if idx == 0 and write_policy == CacheProviderWritePolicy.WRITE_FIRST:
return
def close(self):
for provider in self.cache_providers.values():
provider.close()
for spec_provider in self.cache_spec_providers.values():
spec_provider.close()
class PoTokenRequestDirector:
def __init__(self, logger: IEContentProviderLogger, cache: PoTokenCache):
self.providers: dict[str, PoTokenProvider] = {}
self.preferences: list[Preference] = []
self.cache = cache
self.logger = logger
def register_provider(self, provider: PoTokenProvider):
self.providers[provider.PROVIDER_KEY] = provider
def register_preference(self, preference: Preference):
self.preferences.append(preference)
def _get_providers(self, request: PoTokenRequest) -> Iterable[PoTokenProvider]:
"""Sorts available providers by preference, given a request"""
preferences = {
provider: sum(pref(provider, request) for pref in self.preferences)
for provider in self.providers.values()
}
if self.logger.log_level <= self.logger.LogLevel.TRACE:
# calling is_available() for every PO Token provider upfront may have some overhead
self.logger.trace(f'PO Token Providers: {provider_display_list(self.providers.values())}')
self.logger.trace('Provider preferences for this request: {}'.format(', '.join(
f'{provider.PROVIDER_NAME}={pref}' for provider, pref in preferences.items())))
return (
provider for provider in sorted(
self.providers.values(), key=preferences.get, reverse=True)
if provider.is_available()
)
def _get_po_token(self, request) -> PoTokenResponse | None:
for provider in self._get_providers(request):
try:
self.logger.trace(
f'Attempting to fetch a PO Token from "{provider.PROVIDER_NAME}" provider')
response = provider.request_pot(request.copy())
except PoTokenProviderRejectedRequest as e:
self.logger.trace(
f'PO Token Provider "{provider.PROVIDER_NAME}" rejected this request, '
f'trying next available provider. Reason: {e}')
continue
except PoTokenProviderError as e:
self.logger.warning(
f'Error fetching PO Token from "{provider.PROVIDER_NAME}" provider: '
f'{e!r}{provider_bug_report_message(provider) if not e.expected else ""}')
continue
except Exception as e:
self.logger.error(
f'Unexpected error when fetching PO Token from "{provider.PROVIDER_NAME}" provider: '
f'{e!r}{provider_bug_report_message(provider)}')
continue
self.logger.trace(f'PO Token response from "{provider.PROVIDER_NAME}" provider: {response}')
if not validate_response(response):
self.logger.error(
f'Invalid PO Token response received from "{provider.PROVIDER_NAME}" provider: '
f'{response}{provider_bug_report_message(provider)}')
continue
return response
self.logger.trace('No PO Token providers were able to provide a valid PO Token')
return None
def get_po_token(self, request: PoTokenRequest) -> str | None:
if not request.bypass_cache:
if pot_response := self.cache.get(request):
return clean_pot(pot_response.po_token)
if not self.providers:
self.logger.trace('No PO Token providers registered')
return None
pot_response = self._get_po_token(request)
if not pot_response:
return None
pot_response.po_token = clean_pot(pot_response.po_token)
if pot_response.expires_at is None or pot_response.expires_at > 0:
self.cache.store(request, pot_response)
else:
self.logger.trace(
f'PO Token response will not be cached (expires_at={pot_response.expires_at})')
return pot_response.po_token
def close(self):
for provider in self.providers.values():
provider.close()
self.cache.close()
EXTRACTOR_ARG_PREFIX = 'youtubepot'
def initialize_pot_director(ie):
assert ie._downloader is not None, 'Downloader not set'
enable_trace = ie._configuration_arg(
'pot_trace', ['false'], ie_key='youtube', casesense=False)[0] == 'true'
if enable_trace:
log_level = IEContentProviderLogger.LogLevel.TRACE
elif ie.get_param('verbose', False):
log_level = IEContentProviderLogger.LogLevel.DEBUG
else:
log_level = IEContentProviderLogger.LogLevel.INFO
def get_provider_logger_and_settings(provider, logger_key):
logger_prefix = f'{logger_key}:{provider.PROVIDER_NAME}'
extractor_key = f'{EXTRACTOR_ARG_PREFIX}-{provider.PROVIDER_KEY.lower()}'
return (
YoutubeIEContentProviderLogger(ie, logger_prefix, log_level=log_level),
ie.get_param('extractor_args', {}).get(extractor_key, {}))
cache_providers = []
for cache_provider in _pot_cache_providers.value.values():
logger, settings = get_provider_logger_and_settings(cache_provider, 'pot:cache')
cache_providers.append(cache_provider(ie, logger, settings))
cache_spec_providers = []
for cache_spec_provider in _pot_pcs_providers.value.values():
logger, settings = get_provider_logger_and_settings(cache_spec_provider, 'pot:cache:spec')
cache_spec_providers.append(cache_spec_provider(ie, logger, settings))
cache = PoTokenCache(
logger=YoutubeIEContentProviderLogger(ie, 'pot:cache', log_level=log_level),
cache_providers=cache_providers,
cache_spec_providers=cache_spec_providers,
cache_provider_preferences=list(_pot_cache_provider_preferences.value),
)
director = PoTokenRequestDirector(
logger=YoutubeIEContentProviderLogger(ie, 'pot', log_level=log_level),
cache=cache,
)
ie._downloader.add_close_hook(director.close)
for provider in _pot_providers.value.values():
logger, settings = get_provider_logger_and_settings(provider, 'pot')
director.register_provider(provider(ie, logger, settings))
for preference in _ptp_preferences.value:
director.register_preference(preference)
if director.logger.log_level <= director.logger.LogLevel.DEBUG:
# calling is_available() for every PO Token provider upfront may have some overhead
director.logger.debug(f'PO Token Providers: {provider_display_list(director.providers.values())}')
director.logger.debug(f'PO Token Cache Providers: {provider_display_list(cache.cache_providers.values())}')
director.logger.debug(f'PO Token Cache Spec Providers: {provider_display_list(cache.cache_spec_providers.values())}')
director.logger.trace(f'Registered {len(director.preferences)} provider preferences')
director.logger.trace(f'Registered {len(cache.cache_provider_preferences)} cache provider preferences')
return director
def provider_display_list(providers: Iterable[IEContentProvider]):
def provider_display_name(provider):
display_str = join_nonempty(
provider.PROVIDER_NAME,
provider.PROVIDER_VERSION if not isinstance(provider, BuiltinIEContentProvider) else None)
statuses = []
if not isinstance(provider, BuiltinIEContentProvider):
statuses.append('external')
if not provider.is_available():
statuses.append('unavailable')
if statuses:
display_str += f' ({", ".join(statuses)})'
return display_str
return ', '.join(provider_display_name(provider) for provider in providers) or 'none'
def clean_pot(po_token: str):
# Clean and validate the PO Token. This will strip invalid characters off
# (e.g. additional url params the user may accidentally include)
try:
return base64.urlsafe_b64encode(
base64.urlsafe_b64decode(urllib.parse.unquote(po_token))).decode()
except (binascii.Error, ValueError):
raise ValueError('Invalid PO Token')
def validate_response(response: PoTokenResponse | None):
if (
not isinstance(response, PoTokenResponse)
or not isinstance(response.po_token, str)
or not response.po_token
): # noqa: SIM103
return False
try:
clean_pot(response.po_token)
except ValueError:
return False
if not isinstance(response.expires_at, int):
return response.expires_at is None
return response.expires_at <= 0 or response.expires_at > int(dt.datetime.now(dt.timezone.utc).timestamp())
def validate_cache_spec(spec: PoTokenCacheSpec):
return (
isinstance(spec, PoTokenCacheSpec)
and isinstance(spec.write_policy, CacheProviderWritePolicy)
and isinstance(spec.default_ttl, int)
and isinstance(spec.key_bindings, dict)
and all(isinstance(k, str) for k in spec.key_bindings)
and all(v is None or isinstance(v, str) for v in spec.key_bindings.values())
and bool([v for v in spec.key_bindings.values() if v is not None])
)

View File

@@ -0,0 +1,156 @@
from __future__ import annotations
import abc
import enum
import functools
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.utils import NO_DEFAULT, bug_reports_message, classproperty, traverse_obj
from yt_dlp.version import __version__
# xxx: these could be generalized outside YoutubeIE eventually
class IEContentProviderLogger(abc.ABC):
class LogLevel(enum.IntEnum):
TRACE = 0
DEBUG = 10
INFO = 20
WARNING = 30
ERROR = 40
@classmethod
def _missing_(cls, value):
if isinstance(value, str):
value = value.upper()
if value in dir(cls):
return cls[value]
return cls.INFO
log_level = LogLevel.INFO
@abc.abstractmethod
def trace(self, message: str):
pass
@abc.abstractmethod
def debug(self, message: str):
pass
@abc.abstractmethod
def info(self, message: str):
pass
@abc.abstractmethod
def warning(self, message: str, *, once=False):
pass
@abc.abstractmethod
def error(self, message: str):
pass
class IEContentProviderError(Exception):
def __init__(self, msg=None, expected=False):
super().__init__(msg)
self.expected = expected
class IEContentProvider(abc.ABC):
PROVIDER_VERSION: str = '0.0.0'
BUG_REPORT_LOCATION: str = '(developer has not provided a bug report location)'
def __init__(
self,
ie: InfoExtractor,
logger: IEContentProviderLogger,
settings: dict[str, list[str]], *_, **__,
):
self.ie = ie
self.settings = settings or {}
self.logger = logger
super().__init__()
@classmethod
def __init_subclass__(cls, *, suffix=None, **kwargs):
if suffix:
cls._PROVIDER_KEY_SUFFIX = suffix
return super().__init_subclass__(**kwargs)
@classproperty
def PROVIDER_NAME(cls) -> str:
return cls.__name__[:-len(cls._PROVIDER_KEY_SUFFIX)]
@classproperty
def BUG_REPORT_MESSAGE(cls):
return f'please report this issue to the provider developer at {cls.BUG_REPORT_LOCATION} .'
@classproperty
def PROVIDER_KEY(cls) -> str:
assert hasattr(cls, '_PROVIDER_KEY_SUFFIX'), 'Content Provider implementation must define a suffix for the provider key'
assert cls.__name__.endswith(cls._PROVIDER_KEY_SUFFIX), f'PoTokenProvider class names must end with "{cls._PROVIDER_KEY_SUFFIX}"'
return cls.__name__[:-len(cls._PROVIDER_KEY_SUFFIX)]
@abc.abstractmethod
def is_available(self) -> bool:
"""
Check if the provider is available (e.g. all required dependencies are available)
This is used to determine if the provider should be used and to provide debug information.
IMPORTANT: This method should not make any network requests or perform any expensive operations.
It is called multiple times.
"""
raise NotImplementedError
def close(self): # noqa: B027
pass
def _configuration_arg(self, key, default=NO_DEFAULT, *, casesense=False):
"""
@returns A list of values for the setting given by "key"
or "default" if no such key is present
@param default The default value to return when the key is not present (default: [])
@param casesense When false, the values are converted to lower case
"""
val = traverse_obj(self.settings, key)
if val is None:
return [] if default is NO_DEFAULT else default
return list(val) if casesense else [x.lower() for x in val]
class BuiltinIEContentProvider(IEContentProvider, abc.ABC):
PROVIDER_VERSION = __version__
BUG_REPORT_MESSAGE = bug_reports_message(before='')
def register_provider_generic(
provider,
base_class,
registry,
):
"""Generic function to register a provider class"""
assert issubclass(provider, base_class), f'{provider} must be a subclass of {base_class.__name__}'
assert provider.PROVIDER_KEY not in registry, f'{base_class.__name__} {provider.PROVIDER_KEY} already registered'
registry[provider.PROVIDER_KEY] = provider
return provider
def register_preference_generic(
base_class,
registry,
*providers,
):
"""Generic function to register a preference for a provider"""
assert all(issubclass(provider, base_class) for provider in providers)
def outer(preference):
@functools.wraps(preference)
def inner(provider, *args, **kwargs):
if not providers or isinstance(provider, providers):
return preference(provider, *args, **kwargs)
return 0
registry.add(inner)
return preference
return outer

View File

@@ -0,0 +1,8 @@
from yt_dlp.globals import Indirect
_pot_providers = Indirect({})
_ptp_preferences = Indirect(set())
_pot_pcs_providers = Indirect({})
_pot_cache_providers = Indirect({})
_pot_cache_provider_preferences = Indirect(set())
_pot_memory_cache = Indirect({})

View File

@@ -0,0 +1,97 @@
"""PUBLIC API"""
from __future__ import annotations
import abc
import dataclasses
import enum
import typing
from yt_dlp.extractor.youtube.pot._provider import (
IEContentProvider,
IEContentProviderError,
register_preference_generic,
register_provider_generic,
)
from yt_dlp.extractor.youtube.pot._registry import (
_pot_cache_provider_preferences,
_pot_cache_providers,
_pot_pcs_providers,
)
from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest
class PoTokenCacheProviderError(IEContentProviderError):
"""An error occurred while fetching a PO Token"""
class PoTokenCacheProvider(IEContentProvider, abc.ABC, suffix='PCP'):
@abc.abstractmethod
def get(self, key: str) -> str | None:
pass
@abc.abstractmethod
def store(self, key: str, value: str, expires_at: int):
pass
@abc.abstractmethod
def delete(self, key: str):
pass
class CacheProviderWritePolicy(enum.Enum):
WRITE_ALL = enum.auto() # Write to all cache providers
WRITE_FIRST = enum.auto() # Write to only the first cache provider
@dataclasses.dataclass
class PoTokenCacheSpec:
key_bindings: dict[str, str | None]
default_ttl: int
write_policy: CacheProviderWritePolicy = CacheProviderWritePolicy.WRITE_ALL
# Internal
_provider: PoTokenCacheSpecProvider | None = None
class PoTokenCacheSpecProvider(IEContentProvider, abc.ABC, suffix='PCSP'):
def is_available(self) -> bool:
return True
@abc.abstractmethod
def generate_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None:
"""Generate a cache spec for the given request"""
pass
def register_provider(provider: type[PoTokenCacheProvider]):
"""Register a PoTokenCacheProvider class"""
return register_provider_generic(
provider=provider,
base_class=PoTokenCacheProvider,
registry=_pot_cache_providers.value,
)
def register_spec(provider: type[PoTokenCacheSpecProvider]):
"""Register a PoTokenCacheSpecProvider class"""
return register_provider_generic(
provider=provider,
base_class=PoTokenCacheSpecProvider,
registry=_pot_pcs_providers.value,
)
def register_preference(
*providers: type[PoTokenCacheProvider]) -> typing.Callable[[CacheProviderPreference], CacheProviderPreference]:
"""Register a preference for a PoTokenCacheProvider"""
return register_preference_generic(
PoTokenCacheProvider,
_pot_cache_provider_preferences.value,
*providers,
)
if typing.TYPE_CHECKING:
CacheProviderPreference = typing.Callable[[PoTokenCacheProvider, PoTokenRequest], int]

View File

@@ -0,0 +1,280 @@
"""PUBLIC API"""
from __future__ import annotations
import abc
import copy
import dataclasses
import enum
import functools
import typing
import urllib.parse
from yt_dlp.cookies import YoutubeDLCookieJar
from yt_dlp.extractor.youtube.pot._provider import (
IEContentProvider,
IEContentProviderError,
register_preference_generic,
register_provider_generic,
)
from yt_dlp.extractor.youtube.pot._registry import _pot_providers, _ptp_preferences
from yt_dlp.networking import Request, Response
from yt_dlp.utils import traverse_obj
from yt_dlp.utils.networking import HTTPHeaderDict
__all__ = [
'ExternalRequestFeature',
'PoTokenContext',
'PoTokenProvider',
'PoTokenProviderError',
'PoTokenProviderRejectedRequest',
'PoTokenRequest',
'PoTokenResponse',
'provider_bug_report_message',
'register_preference',
'register_provider',
]
class PoTokenContext(enum.Enum):
GVS = 'gvs'
PLAYER = 'player'
@dataclasses.dataclass
class PoTokenRequest:
# YouTube parameters
context: PoTokenContext
innertube_context: InnertubeContext
innertube_host: str | None = None
session_index: str | None = None
player_url: str | None = None
is_authenticated: bool = False
video_webpage: str | None = None
internal_client_name: str | None = None
# Content binding parameters
visitor_data: str | None = None
data_sync_id: str | None = None
video_id: str | None = None
# Networking parameters
request_cookiejar: YoutubeDLCookieJar = dataclasses.field(default_factory=YoutubeDLCookieJar)
request_proxy: str | None = None
request_headers: HTTPHeaderDict = dataclasses.field(default_factory=HTTPHeaderDict)
request_timeout: float | None = None
request_source_address: str | None = None
request_verify_tls: bool = True
# Generate a new token, do not used a cached token
# The token should still be cached for future requests
bypass_cache: bool = False
def copy(self):
return dataclasses.replace(
self,
request_headers=HTTPHeaderDict(self.request_headers),
innertube_context=copy.deepcopy(self.innertube_context),
)
@dataclasses.dataclass
class PoTokenResponse:
po_token: str
expires_at: int | None = None
class PoTokenProviderRejectedRequest(IEContentProviderError):
"""Reject the PoTokenRequest (cannot handle the request)"""
class PoTokenProviderError(IEContentProviderError):
"""An error occurred while fetching a PO Token"""
class ExternalRequestFeature(enum.Enum):
PROXY_SCHEME_HTTP = enum.auto()
PROXY_SCHEME_HTTPS = enum.auto()
PROXY_SCHEME_SOCKS4 = enum.auto()
PROXY_SCHEME_SOCKS4A = enum.auto()
PROXY_SCHEME_SOCKS5 = enum.auto()
PROXY_SCHEME_SOCKS5H = enum.auto()
SOURCE_ADDRESS = enum.auto()
DISABLE_TLS_VERIFICATION = enum.auto()
class PoTokenProvider(IEContentProvider, abc.ABC, suffix='PTP'):
# Set to None to disable the check
_SUPPORTED_CONTEXTS: tuple[PoTokenContext] | None = ()
# Innertube Client Name.
# For example, "WEB", "ANDROID", "TVHTML5".
# For a list of WebPO client names, see yt_dlp.extractor.youtube.pot.utils.WEBPO_CLIENTS.
# Also see yt_dlp.extractor.youtube._base.INNERTUBE_CLIENTS
# for a list of client names currently supported by the YouTube extractor.
_SUPPORTED_CLIENTS: tuple[str] | None = ()
# If making external requests to websites (i.e. to youtube.com)
# using another library or service (i.e., not _request_webpage),
# add the request features that are supported.
# If only using _request_webpage to make external requests, set this to None.
_SUPPORTED_EXTERNAL_REQUEST_FEATURES: tuple[ExternalRequestFeature] | None = ()
def __validate_request(self, request: PoTokenRequest):
if not self.is_available():
raise PoTokenProviderRejectedRequest(f'{self.PROVIDER_NAME} is not available')
# Validate request using built-in settings
if (
self._SUPPORTED_CONTEXTS is not None
and request.context not in self._SUPPORTED_CONTEXTS
):
raise PoTokenProviderRejectedRequest(
f'PO Token Context "{request.context}" is not supported by {self.PROVIDER_NAME}')
if self._SUPPORTED_CLIENTS is not None:
client_name = traverse_obj(
request.innertube_context, ('client', 'clientName'))
if client_name not in self._SUPPORTED_CLIENTS:
raise PoTokenProviderRejectedRequest(
f'Client "{client_name}" is not supported by {self.PROVIDER_NAME}. '
f'Supported clients: {", ".join(self._SUPPORTED_CLIENTS) or "none"}')
self.__validate_external_request_features(request)
@functools.cached_property
def _supported_proxy_schemes(self):
return {
scheme: feature
for scheme, feature in {
'http': ExternalRequestFeature.PROXY_SCHEME_HTTP,
'https': ExternalRequestFeature.PROXY_SCHEME_HTTPS,
'socks4': ExternalRequestFeature.PROXY_SCHEME_SOCKS4,
'socks4a': ExternalRequestFeature.PROXY_SCHEME_SOCKS4A,
'socks5': ExternalRequestFeature.PROXY_SCHEME_SOCKS5,
'socks5h': ExternalRequestFeature.PROXY_SCHEME_SOCKS5H,
}.items()
if feature in (self._SUPPORTED_EXTERNAL_REQUEST_FEATURES or [])
}
def __validate_external_request_features(self, request: PoTokenRequest):
if self._SUPPORTED_EXTERNAL_REQUEST_FEATURES is None:
return
if request.request_proxy:
scheme = urllib.parse.urlparse(request.request_proxy).scheme
if scheme.lower() not in self._supported_proxy_schemes:
raise PoTokenProviderRejectedRequest(
f'External requests by "{self.PROVIDER_NAME}" provider do not '
f'support proxy scheme "{scheme}". Supported proxy schemes: '
f'{", ".join(self._supported_proxy_schemes) or "none"}')
if (
request.request_source_address
and ExternalRequestFeature.SOURCE_ADDRESS not in self._SUPPORTED_EXTERNAL_REQUEST_FEATURES
):
raise PoTokenProviderRejectedRequest(
f'External requests by "{self.PROVIDER_NAME}" provider '
f'do not support setting source address')
if (
not request.request_verify_tls
and ExternalRequestFeature.DISABLE_TLS_VERIFICATION not in self._SUPPORTED_EXTERNAL_REQUEST_FEATURES
):
raise PoTokenProviderRejectedRequest(
f'External requests by "{self.PROVIDER_NAME}" provider '
f'do not support ignoring TLS certificate failures')
def request_pot(self, request: PoTokenRequest) -> PoTokenResponse:
self.__validate_request(request)
return self._real_request_pot(request)
@abc.abstractmethod
def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse:
"""To be implemented by subclasses"""
pass
# Helper functions
def _request_webpage(self, request: Request, pot_request: PoTokenRequest | None = None, note=None, **kwargs) -> Response:
"""Make a request using the internal HTTP Client.
Use this instead of calling requests, urllib3 or other HTTP client libraries directly!
YouTube cookies will be automatically applied if this request is made to YouTube.
@param request: The request to make
@param pot_request: The PoTokenRequest to use. Request parameters will be merged from it.
@param note: Custom log message to display when making the request. Set to `False` to disable logging.
Tips:
- Disable proxy (e.g. if calling local service): Request(..., proxies={'all': None})
- Set request timeout: Request(..., extensions={'timeout': 5.0})
"""
req = request.copy()
# Merge some ctx request settings into the request
# Most of these will already be used by the configured ydl instance,
# however, the YouTube extractor may override some.
if pot_request is not None:
req.headers = HTTPHeaderDict(pot_request.request_headers, req.headers)
req.proxies = req.proxies or ({'all': pot_request.request_proxy} if pot_request.request_proxy else {})
if pot_request.request_cookiejar is not None:
req.extensions['cookiejar'] = req.extensions.get('cookiejar', pot_request.request_cookiejar)
if note is not False:
self.logger.info(str(note) if note else 'Requesting webpage')
return self.ie._downloader.urlopen(req)
def register_provider(provider: type[PoTokenProvider]):
"""Register a PoTokenProvider class"""
return register_provider_generic(
provider=provider,
base_class=PoTokenProvider,
registry=_pot_providers.value,
)
def provider_bug_report_message(provider: IEContentProvider, before=';'):
msg = provider.BUG_REPORT_MESSAGE
before = before.rstrip()
if not before or before.endswith(('.', '!', '?')):
msg = msg[0].title() + msg[1:]
return f'{before} {msg}' if before else msg
def register_preference(*providers: type[PoTokenProvider]) -> typing.Callable[[Preference], Preference]:
"""Register a preference for a PoTokenProvider"""
return register_preference_generic(
PoTokenProvider,
_ptp_preferences.value,
*providers,
)
if typing.TYPE_CHECKING:
Preference = typing.Callable[[PoTokenProvider, PoTokenRequest], int]
__all__.append('Preference')
# Barebones innertube context. There may be more fields.
class ClientInfo(typing.TypedDict, total=False):
hl: str | None
gl: str | None
remoteHost: str | None
deviceMake: str | None
deviceModel: str | None
visitorData: str | None
userAgent: str | None
clientName: str
clientVersion: str
osName: str | None
osVersion: str | None
class InnertubeContext(typing.TypedDict, total=False):
client: ClientInfo
request: dict
user: dict

View File

@@ -0,0 +1,73 @@
"""PUBLIC API"""
from __future__ import annotations
import base64
import contextlib
import enum
import re
import urllib.parse
from yt_dlp.extractor.youtube.pot.provider import PoTokenContext, PoTokenRequest
from yt_dlp.utils import traverse_obj
__all__ = ['WEBPO_CLIENTS', 'ContentBindingType', 'get_webpo_content_binding']
WEBPO_CLIENTS = (
'WEB',
'MWEB',
'TVHTML5',
'WEB_EMBEDDED_PLAYER',
'WEB_CREATOR',
'WEB_REMIX',
'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
)
class ContentBindingType(enum.Enum):
VISITOR_DATA = 'visitor_data'
DATASYNC_ID = 'datasync_id'
VIDEO_ID = 'video_id'
VISITOR_ID = 'visitor_id'
def get_webpo_content_binding(
request: PoTokenRequest,
webpo_clients=WEBPO_CLIENTS,
bind_to_visitor_id=False,
) -> tuple[str | None, ContentBindingType | None]:
client_name = traverse_obj(request.innertube_context, ('client', 'clientName'))
if not client_name or client_name not in webpo_clients:
return None, None
if request.context == PoTokenContext.GVS or client_name in ('WEB_REMIX', ):
if request.is_authenticated:
return request.data_sync_id, ContentBindingType.DATASYNC_ID
else:
if bind_to_visitor_id:
visitor_id = _extract_visitor_id(request.visitor_data)
if visitor_id:
return visitor_id, ContentBindingType.VISITOR_ID
return request.visitor_data, ContentBindingType.VISITOR_DATA
elif request.context == PoTokenContext.PLAYER or client_name != 'WEB_REMIX':
return request.video_id, ContentBindingType.VIDEO_ID
return None, None
def _extract_visitor_id(visitor_data):
if not visitor_data:
return None
# Attempt to extract the visitor ID from the visitor_data protobuf
# xxx: ideally should use a protobuf parser
with contextlib.suppress(Exception):
visitor_id = base64.urlsafe_b64decode(
urllib.parse.unquote_plus(visitor_data))[2:13].decode()
# check that visitor id is all letters and numbers
if re.fullmatch(r'[A-Za-z0-9_-]{11}', visitor_id):
return visitor_id
return None