Merge 6f271423e8 into c5098961b0

Update tests
Fix old metadata reference
2024-12-16 06:27:00 +00:00 · 2024-10-20 12:38:13 +00:00 · 2024-10-20 14:38:03 +02:00 · 2024-10-20 13:41:53 +02:00 · 2024-10-20 12:37:48 +02:00 · 2024-10-20 12:28:10 +02:00
1 changed file with 110 additions and 54 deletions
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -5,7 +5,7 @@ import json
 import re

 from .common import InfoExtractor
-from ..utils import ExtractorError, join_nonempty, traverse_obj
+from ..utils import ExtractorError, int_or_none, join_nonempty, merge_dicts, traverse_obj, url_or_none, T


 class NPOIE(InfoExtractor):
@@ -17,23 +17,23 @@ class NPOIE(InfoExtractor):
        'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/',
        'md5': 'f9ce9c43cc8bc3b8138df1562b99c379',
        'info_dict': {
-            'description': 'Wie is de mol? (2)',
-            'duration': 2439,
-            'ext': 'm4v',
-            'id': 'wie-is-de-mol-2',
+            'title': 'Wie is de mol? (2)',
            'thumbnail': 'https://assets-start.npo.nl/resources/2023/07/01/e723c3cf-3e42-418a-9ba5-f6dbb64b516a.jpg',
-            'title': 'Wie is de mol? (2)'
+            'duration': 2439,
+            'id': 'wie-is-de-mol-2',
+            'description': 'wie-is-de-mol-2',
+            'ext': 'mp4',
        }
    }, {
        'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika',
        'md5': 'c84d054219c4888ed53b4ee3d01b2d93',
        'info_dict': {
-            'id': 'zwart-geld-de-toekomst-komt-uit-afrika',
            'title': 'Zwart geld: de toekomst komt uit Afrika',
-            'ext': 'mp4',
-            'description': 'Zwart geld: de toekomst komt uit Afrika',
            'thumbnail': 'https://assets-start.npo.nl/resources/2023/06/30/d9879593-1944-4249-990c-1561dac14d8e.jpg',
-            'duration': 3000
+            'duration': 3000,
+            'id': 'zwart-geld-de-toekomst-komt-uit-afrika',
+            'description': 'zwart-geld-de-toekomst-komt-uit-afrika',
+            'ext': 'mp4',
        },
    }]

@@ -50,30 +50,22 @@ class NPOIE(InfoExtractor):

        program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail',
                                               slug, query={'slug': slug})
-        product_id = program_metadata.get('productId')
-        images = program_metadata.get('images')
-        thumbnail = None
-        for image in images:
-            thumbnail = image.get('url')
-            break
-        title = program_metadata.get('title')
-        descriptions = program_metadata.get('description', {})
-        description = descriptions.get('long') or descriptions.get('short') or descriptions.get('brief')
-        duration = program_metadata.get('durationInSeconds')
-
+        product_id = traverse_obj(program_metadata, 'productId')
        if not product_id:
-            raise ExtractorError('No productId found for slug: %s' % slug)
-
+            raise ExtractorError('No productId found for slug: %s' % (slug,))
        formats = self._extract_formats_by_product_id(product_id, slug, url)
-
-        return {
+        self._sort_formats(formats)
+        return merge_dicts(traverse_obj(program_metadata, {
+            'title': 'title',
+            'description': (('description', ('long', 'short', 'brief')), 'title'),
+            'thumbnail': ('images', Ellipsis, 'url', T(url_or_none)),
+            'duration': ('durationInSeconds', T(int_or_none)),
+        }, get_all=False), {
            'id': slug,
            'formats': formats,
-            'title': title or slug,
-            'description': description or title or slug,
-            'thumbnail': thumbnail,
-            'duration': duration,
-        }
+            'title': slug,
+            'description': slug,
+        })

    def _extract_formats_by_product_id(self, product_id, slug, url=None):
        token = self._get_token(product_id)
@@ -118,6 +110,70 @@ class BNNVaraIE(NPOIE):
    def _real_extract(self, url):
        url = url.rstrip('/')
        video_id = url.split('/')[-1]
+        graphql_query = """query getMedia($id: ID!, $mediaUrl: String, $hasAdConsent: Boolean!, $atInternetId: Int) {
+                            player(
+                                id: $id
+                                mediaUrl: $mediaUrl
+                                hasAdConsent: $hasAdConsent
+                                atInternetId: $atInternetId
+                            ) {
+                            ... on PlayerSucces {
+                                brand {
+                                    name
+                                    slug
+                                    broadcastsEnabled
+                                    __typename
+                                }
+                                title
+                                programTitle
+                                pomsProductId
+                                broadcasters {
+                                    name
+                                    __typename
+                                }
+                                duration
+                                classifications {
+                                    title
+                                    imageUrl
+                                    type
+                                    __typename
+                                }
+                                image {
+                                    title
+                                    url
+                                    __typename
+                                }
+                                cta {
+                                    title
+                                    url
+                                    __typename
+                                }
+                                genres {
+                                    name
+                                    __typename
+                                }
+                                subtitles {
+                                    url
+                                    language
+                                    __typename
+                                }
+                                sources {
+                                    name
+                                    url
+                                    ratio
+                                    __typename
+                                }
+                                    type
+                                    token
+                                    __typename
+                                }
+                                ... on PlayerError {
+                                    error
+                                    __typename
+                                }
+                                    __typename
+                            }
+}"""

        media = self._download_json('https://api.bnnvara.nl/bff/graphql',
                                    video_id,
@@ -129,14 +185,15 @@ class BNNVaraIE(NPOIE):
                                                'hasAdConsent': False,
                                                'atInternetId': 70
                                            },
-                                            'query': 'query getMedia($id: ID!, $mediaUrl: String, $hasAdConsent: Boolean!, $atInternetId: Int) {\n  player(\n    id: $id\n    mediaUrl: $mediaUrl\n    hasAdConsent: $hasAdConsent\n    atInternetId: $atInternetId\n  ) {\n    ... on PlayerSucces {\n      brand {\n        name\n        slug\n        broadcastsEnabled\n        __typename\n      }\n      title\n      programTitle\n      pomsProductId\n      broadcasters {\n        name\n        __typename\n      }\n      duration\n      classifications {\n        title\n        imageUrl\n        type\n        __typename\n      }\n      image {\n        title\n        url\n        __typename\n      }\n      cta {\n        title\n        url\n        __typename\n      }\n      genres {\n        name\n        __typename\n      }\n      subtitles {\n        url\n        language\n        __typename\n      }\n      sources {\n        name\n        url\n        ratio\n        __typename\n      }\n      type\n      token\n      __typename\n    }\n    ... on PlayerError {\n      error\n      __typename\n    }\n    __typename\n  }\n}'
+                                            'query': graphql_query
                                        }).encode('utf8'),
                                    headers={
                                        'Content-Type': 'application/json',
                                    })
-        product_id = media.get('data', {}).get('player', {}).get('pomsProductId')

-        formats = self._extract_formats_by_product_id(product_id, video_id)
+        product_id = traverse_obj(media, ('data', 'player', 'pomsProductId'))
+        formats = self._extract_formats_by_product_id(product_id, video_id) if product_id else []
+        self._sort_formats(formats)

        return {
            'id': product_id,
@@ -154,7 +211,9 @@ class ONIE(NPOIE):
        'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/',
        'md5': 'a85ebd50fa86fe5cbce654655f7dbb12',
        'info_dict': {
-
+            'id': 'heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel',
+            'title': 'heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel',
+            'ext': 'mp4',
        }
    }]

@@ -166,9 +225,7 @@ class ONIE(NPOIE):
        for result in results:
            formats.extend(self._extract_formats_by_product_id(result, video_id))

-        if not formats:
-            raise ExtractorError('Could not find a POMS product id in the provided URL, '
-                                 'perhaps because all stream URLs are DRM protected.')
+        self._sort_formats(formats)

        return {
            'id': video_id,
@@ -188,6 +245,7 @@ class ZAPPIE(NPOIE):
        'info_dict': {
            'id': 'POMS_AT_811523',
            'title': 'POMS_AT_811523',
+            'ext': 'mp4',
        },
    }]

@@ -213,21 +271,23 @@ class SchoolTVIE(NPOIE):
        'md5': 'e9ef151c4886994e2bea23593348cb14',
        'info_dict': {
            'id': 'zapp-music-challenge-2015-zapp-music-challenge-2015',
-            'title': 'Zapp Music Challenge 2015 - Alain Clark & Yaell',
-            'description': "Een nummer schrijven met de super bekende soulzanger en producer Alain Clark? Dat is de uitdaging voor de dertienjarige Yaell uit Delft. En als het dan echt goed is, mag hij het ook nog eens live gaan spelen op de speelplaats bij Giel Beelen! Muziek is heel erg belangrijk in het leven van Yaell. 'Als er geen muziek zou zijn, dan zou ik heel veel niet kunnen.' Hij is dan ook altijd aan het schrijven, vaak over zijn eigen leven. Maar soms is het best lastig om die teksten te verzinnen. Vindt hij de inspiratie om een hit te maken met Alain?"
+            'title': 'Zapp Music Challenge 2015-Alain Clark & Yaell',
+            'description': "Een nummer schrijven met de super bekende soulzanger en producer Alain Clark? Dat is de uitdaging voor de dertienjarige Yaell uit Delft. En als het dan echt goed is, mag hij het ook nog eens live gaan spelen op de speelplaats bij Giel Beelen! Muziek is heel erg belangrijk in het leven van Yaell. 'Als er geen muziek zou zijn, dan zou ik heel veel niet kunnen.' Hij is dan ook altijd aan het schrijven, vaak over zijn eigen leven. Maar soms is het best lastig om die teksten te verzinnen. Vindt hij de inspiratie om een hit te maken met Alain?",
+            'ext': 'mp4',
        },
    }]

    def _real_extract(self, url):
        video_id = url.rstrip('/').split('/')[-1]

-        # TODO Find out how we could obtain this automatically
-        #      Otherwise this extractor might break each time SchoolTV deploys a new release
-        build_id = 'b7eHUzAVO7wHXCopYxQhV'
+        build_id = self._search_nextjs_data(
+            self._download_webpage(url, video_id),
+            video_id,
+        )['buildId']

        metadata_url = 'https://schooltv.nl/_next/data/' \
                       + build_id \
-                       + '/item/' \
+                       + '/video-item/' \
                       + video_id + '.json'

        metadata = self._download_json(metadata_url,
@@ -235,9 +295,7 @@ class SchoolTVIE(NPOIE):

        formats = self._extract_formats_by_product_id(metadata.get('poms_mid'), video_id)

-        if not formats:
-            raise ExtractorError('Could not find a POMS product id in the provided URL, '
-                                 'perhaps because all stream URLs are DRM protected.')
+        self._sort_formats(formats)

        return {
            'id': video_id,
@@ -258,9 +316,7 @@ class NTRSubsiteIE(NPOIE):
            formats.extend(self._extract_formats_by_product_id(result, video_id))
            break

-        if not formats:
-            raise ExtractorError('Could not find a POMS product id in the provided URL, '
-                                 'perhaps because all stream URLs are DRM protected.')
+        self._sort_formats(formats)

        return {
            'id': video_id,
@@ -279,6 +335,7 @@ class HetKlokhuisIE(NTRSubsiteIE):
        'info_dict': {
            'id': 'aliens',
            'title': 'aliens',
+            'ext': 'mp4',
        },
    }]

@@ -293,7 +350,7 @@ class VPROIE(NPOIE):
        'info_dict': {
            'id': 'offline-als-luxe.html',
            'title': 'offline-als-luxe.html',
-            'ext': 'm4v',
+            'ext': 'mp4',
        },
    }]

@@ -304,11 +361,9 @@ class VPROIE(NPOIE):
        formats = []
        for result in results:
            formats.extend(self._extract_formats_by_product_id(result, video_id))
-            break  # TODO find a better solution, VPRO pages can have multiple videos embedded
+            break

-        if not formats:
-            raise ExtractorError('Could not find a POMS product id in the provided URL, '
-                                 'perhaps because all stream URLs are DRM protected.')
+        self._sort_formats(formats)

        return {
            'id': video_id,
@@ -327,5 +382,6 @@ class AndereTijdenIE(NTRSubsiteIE):
        'info_dict': {
            'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem',
            'title': 'Duitse-soldaten-over-de-Slag-bij-Arnhem',
+            'ext': 'mp4',
        },
    }]
Author	SHA1	Message	Date
Bart Broere	0f551272a3	Merge `6f271423e8` into `c5098961b0`	2024-10-20 12:38:13 +00:00
Bart Broere	6f271423e8	Update tests	2024-10-20 14:38:03 +02:00
Bart Broere	75266ce4ed	Fix old metadata reference	2024-10-20 13:41:53 +02:00
Bart Broere	817e2e5938	Fix some missing imports	2024-10-20 12:37:48 +02:00
Bart Broere	0e1a0cfa03	Apply some more PR feedback	2024-10-20 12:28:10 +02:00
Bart Broere	7f1c09bea1	Use _sort_formats util	2024-10-20 12:11:30 +02:00
Bart Broere	c3026dd70c	Apply suggestion from PR	2024-10-20 12:08:50 +02:00
Bart Broere	41157b2b49	Move GraphQL query into separate variable	2024-10-20 12:00:44 +02:00
Bart Broere	c748eca829	Automatically obtain NextJS buildId and change item to video-item	2024-10-20 11:49:09 +02:00