[funnyordie] Extract more metadata (closes #13677)

2017-07-20 22:49:52 +07:00 · 2017-07-20 22:49:52 +07:00 · c653326a14
parent 3fcf346ac1
commit c653326a14
1 changed file with 56 additions and 8 deletions
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -1,10 +1,14 @@
 from __future__ import unicode_literals
 import json
 import re
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    unified_timestamp,
 )
 class FunnyOrDieIE(InfoExtractor):
@@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor):
            'title': 'Heart-Shaped Box: Literal Video Version',
            'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
            'thumbnail': r're:^http:.*\.jpg$',
            'uploader': 'DASjr',
            'timestamp': 1317904928,
            'upload_date': '20111006',
            'duration': 318.3,
        },
    }, {
        'url': 'http://www.funnyordie.com/embed/e402820827',
@@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor):
            'title': 'Please Use This Song (Jon Lajoie)',
            'description': 'Please use this to sell something.  www.jonlajoie.com',
            'thumbnail': r're:^http:.*\.jpg$',
            'timestamp': 1398988800,
            'upload_date': '20140502',
        },
        'params': {
            'skip_download': True,
@@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor):
                'url': 'http://www.funnyordie.com%s' % src,
            }]
-        post_json = self._search_regex(
+        timestamp = unified_timestamp(self._html_search_meta(
-            r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
+            'uploadDate', webpage, 'timestamp', default=None))
-        post = json.loads(post_json)
+
        uploader = self._html_search_regex(
            r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
            webpage, 'uploader', default=None)
        title, description, thumbnail, duration = [None] * 4
        medium = self._parse_json(
            self._search_regex(
                r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
                default='{}'),
            video_id, fatal=False)
        if medium:
            title = medium.get('title')
            duration = float_or_none(medium.get('duration'))
            if not timestamp:
                timestamp = unified_timestamp(medium.get('publishDate'))
        post = self._parse_json(
            self._search_regex(
                r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
                default='{}'),
            video_id, fatal=False)
        if post:
            if not title:
                title = post.get('name')
            description = post.get('description')
            thumbnail = post.get('picture')
        if not title:
            title = self._og_search_title(webpage)
        if not description:
            description = self._og_search_description(webpage)
        if not duration:
            duration = int_or_none(self._html_search_meta(
                ('video:duration', 'duration'), webpage, 'duration', default=False))
        return {
            'id': video_id,
-            'title': post['name'],
+            'title': title,
-            'description': post.get('description'),
+            'description': description,
-            'thumbnail': post.get('picture'),
+            'thumbnail': thumbnail,
            'uploader': uploader,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }