mirror of
				https://github.com/ytdl-org/youtube-dl
				synced 2025-10-31 09:43:32 +00:00 
			
		
		
		
	[wsj] Add new extractor (Fixes #4854)
This commit is contained in:
		
							parent
							
								
									1a6373ef39
								
							
						
					
					
						commit
						9bb8e0a3f9
					
				| @ -156,6 +156,9 @@ class TestUtil(unittest.TestCase): | ||||
|         self.assertEqual( | ||||
|             unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False), | ||||
|             '20141126') | ||||
|         self.assertEqual( | ||||
|             unified_strdate('2/2/2015 6:47:40 PM', day_first=False), | ||||
|             '20150202') | ||||
| 
 | ||||
|     def test_find_xpath_attr(self): | ||||
|         testxml = '''<root> | ||||
|  | ||||
| @ -554,6 +554,7 @@ from .wimp import WimpIE | ||||
| from .wistia import WistiaIE | ||||
| from .worldstarhiphop import WorldStarHipHopIE | ||||
| from .wrzuta import WrzutaIE | ||||
| from .wsj import WSJIE | ||||
| from .xbef import XBefIE | ||||
| from .xboxclips import XboxClipsIE | ||||
| from .xhamster import XHamsterIE | ||||
|  | ||||
| @ -145,6 +145,7 @@ class InfoExtractor(object): | ||||
|     thumbnail:      Full URL to a video thumbnail image. | ||||
|     description:    Full video description. | ||||
|     uploader:       Full name of the video uploader. | ||||
|     creator:        The main artist who created the video. | ||||
|     timestamp:      UNIX timestamp of the moment the video became available. | ||||
|     upload_date:    Video upload date (YYYYMMDD). | ||||
|                     If not explicitly set, calculated from timestamp. | ||||
|  | ||||
							
								
								
									
										89
									
								
								youtube_dl/extractor/wsj.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								youtube_dl/extractor/wsj.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,89 @@ | ||||
| # encoding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .common import InfoExtractor | ||||
| from ..utils import ( | ||||
|     int_or_none, | ||||
|     unified_strdate, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
class WSJIE(InfoExtractor):
    """Information extractor for Wall Street Journal video player iframes.

    Matches ``video-api.wsj.com`` player iframe URLs carrying a ``guid``
    query parameter and resolves the video through the
    ``find_all_videos.asp`` JSON endpoint.
    """

    _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P<id>[a-zA-Z0-9-]+)'
    IE_DESC = 'Wall Street Journal'
    _TEST = {
        'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
        'md5': '9747d7a6ebc2f4df64b981e1dde9efa9',
        'info_dict': {
            'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
            'ext': 'mp4',
            'upload_date': '20150202',
            'uploader_id': 'bbright',
            'creator': 'bbright',
            'categories': list,  # a long list
            'duration': 90,
            'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the video identified by the GUID in *url*.

        Queries the WSJ video API for a single item matching the GUID and
        assembles the available formats (f4m meta URL, HLS variants and
        per-bitrate progressive MP4 URLs) together with the metadata the
        API returns.

        Raises whatever ``_download_json`` raises on download/parse
        failure, and ``KeyError``/``IndexError`` if the response has no
        ``items`` entry.
        """
        video_id = self._match_id(url)

        # Bitrates (in kbit/s) for which the API exposes a dedicated
        # ``video<N>kMP4Url`` field; each one must be requested explicitly.
        bitrates = [128, 174, 264, 320, 464, 664, 1264]
        api_url = (
            'http://video-api.wsj.com/api-video/find_all_videos.asp?'
            'type=guid&count=1&query=%s&'
            'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,'
            'author,description,name,linkURL,videoStillURL,duration,videoURL,'
            'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,'
            'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,'
            'allthingsd-subsection,sm-section,sm-subsection,provider,'
            'formattedCreationDate,keywords,keywordsOmniture,column,editor,'
            'emailURL,emailPartnerID,showName,omnitureProgramName,'
            'omnitureVideoFormat,linkRelativeURL,touchCastID,'
            'omniturePublishDate,%s') % (
                video_id, ','.join('video%dkMP4Url' % br for br in bitrates))
        # count=1 is requested above, so only the first item is relevant.
        info = self._download_json(api_url, video_id)['items'][0]

        # Thumbnails are conveniently in the correct format already
        thumbnails = info.get('thumbnailList')
        creator = info.get('author')
        uploader_id = info.get('editor')
        categories = info.get('keywords')
        duration = int_or_none(info.get('duration'))
        # The API date is month-first (e.g. '2/2/2015 6:47:40 PM').
        upload_date = unified_strdate(
            info.get('formattedCreationDate'), day_first=False)
        # Use ``or`` rather than a .get() default so that a present but
        # null/empty 'name' still falls back to 'titletag'.
        title = info.get('name') or info.get('titletag')

        formats = []
        # Guard the f4m entry like the other formats so that a missing
        # 'videoURL' does not abort extraction when HLS/MP4 URLs exist.
        if info.get('videoURL'):
            formats.append({
                'format_id': 'f4m',
                'format_note': 'f4m (meta URL)',
                'url': info['videoURL'],
            })
        if info.get('hls'):
            formats.extend(self._extract_m3u8_formats(
                info['hls'], video_id, ext='mp4',
                preference=0, entry_protocol='m3u8_native'))
        for br in bitrates:
            field = 'video%dkMP4Url' % br
            if info.get(field):
                formats.append({
                    'format_id': 'mp4-%d' % br,
                    'container': 'mp4',
                    'tbr': br,
                    'url': info[field],
                })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnails': thumbnails,
            'creator': creator,
            'uploader_id': uploader_id,
            'duration': duration,
            'upload_date': upload_date,
            'categories': categories,
        }
| @ -701,7 +701,7 @@ def unified_strdate(date_str, day_first=True): | ||||
|     # %z (UTC offset) is only supported in python>=3.2 | ||||
|     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) | ||||
|     # Remove AM/PM + timezone | ||||
|     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str) | ||||
|     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) | ||||
| 
 | ||||
|     format_expressions = [ | ||||
|         '%d %B %Y', | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Philipp Hagemeister
						Philipp Hagemeister