[cinchcast] Add new extractor (Fixes #4428)

2014-12-12 02:57:36 +01:00 · 2014-12-12 02:57:36 +01:00 · 42bdd9d051
commit 42bdd9d051
parent 4e40de6e2a
5 changed files with 88 additions and 6 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -144,6 +144,9 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
        self.assertEqual(unified_strdate('1968-12-10'), '19681210')
        self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
+        self.assertEqual(
+            unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),
+            '20141126')

    def test_find_xpath_attr(self):
        testxml = '''<root>
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -51,6 +51,7 @@ from .cbsnews import CBSNewsIE
 from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .chilloutzone import ChilloutzoneIE
+from .cinchcast import CinchcastIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
 from .clipsyndicate import ClipsyndicateIE
--- a/youtube_dl/extractor/cinchcast.py
+++ b/youtube_dl/extractor/cinchcast.py
@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+    xpath_text,
+)
+
+
+class CinchcastIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
+    _TEST = {
+        # Actual test is run in generic, look for undergroundwellness
+        'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        doc = self._download_xml(
+            'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id,
+            video_id)
+
+        item = doc.find('.//item')
+        title = xpath_text(item, './title', fatal=True)
+        date_str = xpath_text(
+            item, './{http://developer.longtailvideo.com/trac/}date')
+        upload_date = unified_strdate(date_str, day_first=False)
+        # duration is present but wrong
+        formats = []
+        formats.append({
+            'format_id': 'main',
+            'url': item.find(
+                './{http://search.yahoo.com/mrss/}content').attrib['url'],
+        })
+        backup_url = xpath_text(
+            item, './{http://developer.longtailvideo.com/trac/}backupContent')
+        if backup_url:
+            formats.append({
+                'preference': 2,  # seems to be more reliable
+                'format_id': 'backup',
+                'url': backup_url,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'upload_date': upload_date,
+            'formats': formats,
+        }
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -467,8 +467,17 @@ class GenericIE(InfoExtractor):
            'expected_warnings': [
                'URL could be a direct video link, returning it as such.'
            ]
-        }
-
+        },
+        # Cinchcast embed
+        {
+            'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
+            'info_dict': {
+                'id': '7141703',
+                'ext': 'mp3',
+                'upload_date': '20141126',
+                'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
+            }
+        },
    ]

    def report_following_redirect(self, new_url):
@ -962,6 +971,13 @@ class GenericIE(InfoExtractor):
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'SBS')

+        # Look for embedded Cinchcast player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Cinchcast')
+
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
            webpage)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -166,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False):
        xpath = xpath.encode('ascii')

    n = node.find(xpath)
-    if n is None:
+    if n is None or n.text is None:
        if fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
@ -644,17 +644,19 @@ def parse_iso8601(date_str, delimiter='T'):
    return calendar.timegm(dt.timetuple())


-def unified_strdate(date_str):
+def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
-
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
+    # Remove AM/PM + timezone
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
+
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
@ -669,7 +671,6 @@ def unified_strdate(date_str):
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
-        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
@ -681,6 +682,14 @@ def unified_strdate(date_str):
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
+    if day_first:
+        format_expressions.extend([
+            '%d/%m/%Y %H:%M:%S',
+        ])
+    else:
+        format_expressions.extend([
+            '%m/%d/%Y %H:%M:%S',
+        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')