youtube-dl/bin/0000755000000000000000000000000012641030331012406 5ustar rootrootyoutube-dl/bin/youtube-dl0000755000000000000000000000013312641030331014422 0ustar rootroot#!/usr/bin/env python import youtube_dl if __name__ == '__main__': youtube_dl.main() youtube-dl/devscripts/0000755000000000000000000000000012641030331014024 5ustar rootrootyoutube-dl/devscripts/buildserver.py0000644000000000000000000003160212641030331016726 0ustar rootroot#!/usr/bin/python3 from http.server import HTTPServer, BaseHTTPRequestHandler from socketserver import ThreadingMixIn import argparse import ctypes import functools import sys import threading import traceback import os.path class BuildHTTPServer(ThreadingMixIn, HTTPServer): allow_reuse_address = True advapi32 = ctypes.windll.advapi32 SC_MANAGER_ALL_ACCESS = 0xf003f SC_MANAGER_CREATE_SERVICE = 0x02 SERVICE_WIN32_OWN_PROCESS = 0x10 SERVICE_AUTO_START = 0x2 SERVICE_ERROR_NORMAL = 0x1 DELETE = 0x00010000 SERVICE_STATUS_START_PENDING = 0x00000002 SERVICE_STATUS_RUNNING = 0x00000004 SERVICE_ACCEPT_STOP = 0x1 SVCNAME = 'youtubedl_builder' LPTSTR = ctypes.c_wchar_p START_CALLBACK = ctypes.WINFUNCTYPE(None, ctypes.c_int, ctypes.POINTER(LPTSTR)) class SERVICE_TABLE_ENTRY(ctypes.Structure): _fields_ = [ ('lpServiceName', LPTSTR), ('lpServiceProc', START_CALLBACK) ] HandlerEx = ctypes.WINFUNCTYPE( ctypes.c_int, # return ctypes.c_int, # dwControl ctypes.c_int, # dwEventType ctypes.c_void_p, # lpEventData, ctypes.c_void_p, # lpContext, ) def _ctypes_array(c_type, py_array): ar = (c_type * len(py_array))() ar[:] = py_array return ar def win_OpenSCManager(): res = advapi32.OpenSCManagerW(None, None, SC_MANAGER_ALL_ACCESS) if not res: raise Exception('Opening service manager failed - ' 'are you running this as administrator?') return res def win_install_service(service_name, cmdline): manager = win_OpenSCManager() try: h = advapi32.CreateServiceW( manager, service_name, None, SC_MANAGER_CREATE_SERVICE, SERVICE_WIN32_OWN_PROCESS, SERVICE_AUTO_START, SERVICE_ERROR_NORMAL, cmdline, None, None, None, None, None) if not h: raise OSError('Service creation failed: %s' % ctypes.FormatError()) advapi32.CloseServiceHandle(h) finally: advapi32.CloseServiceHandle(manager) def win_uninstall_service(service_name): manager = win_OpenSCManager() try: h = advapi32.OpenServiceW(manager, service_name, DELETE) if not h: raise OSError('Could not find service %s: %s' % ( service_name, ctypes.FormatError())) try: if not advapi32.DeleteService(h): raise OSError('Deletion failed: %s' % ctypes.FormatError()) finally: advapi32.CloseServiceHandle(h) finally: advapi32.CloseServiceHandle(manager) def win_service_report_event(service_name, msg, is_error=True): with open('C:/sshkeys/log', 'a', encoding='utf-8') as f: f.write(msg + '\n') event_log = advapi32.RegisterEventSourceW(None, service_name) if not event_log: raise OSError('Could not report event: %s' % ctypes.FormatError()) try: type_id = 0x0001 if is_error else 0x0004 event_id = 0xc0000000 if is_error else 0x40000000 lines = _ctypes_array(LPTSTR, [msg]) if not advapi32.ReportEventW( event_log, type_id, 0, event_id, None, len(lines), 0, lines, None): raise OSError('Event reporting failed: %s' % ctypes.FormatError()) finally: advapi32.DeregisterEventSource(event_log) def win_service_handler(stop_event, *args): try: raise ValueError('Handler called with args ' + repr(args)) TODO except Exception as e: tb = traceback.format_exc() msg = str(e) + '\n' + tb win_service_report_event(service_name, msg, is_error=True) raise def 
win_service_set_status(handle, status_code): svcStatus = SERVICE_STATUS() svcStatus.dwServiceType = SERVICE_WIN32_OWN_PROCESS svcStatus.dwCurrentState = status_code svcStatus.dwControlsAccepted = SERVICE_ACCEPT_STOP svcStatus.dwServiceSpecificExitCode = 0 if not advapi32.SetServiceStatus(handle, ctypes.byref(svcStatus)): raise OSError('SetServiceStatus failed: %r' % ctypes.FormatError()) def win_service_main(service_name, real_main, argc, argv_raw): try: # args = [argv_raw[i].value for i in range(argc)] stop_event = threading.Event() handler = HandlerEx(functools.partial(stop_event, win_service_handler)) h = advapi32.RegisterServiceCtrlHandlerExW(service_name, handler, None) if not h: raise OSError('Handler registration failed: %s' % ctypes.FormatError()) TODO except Exception as e: tb = traceback.format_exc() msg = str(e) + '\n' + tb win_service_report_event(service_name, msg, is_error=True) raise def win_service_start(service_name, real_main): try: cb = START_CALLBACK( functools.partial(win_service_main, service_name, real_main)) dispatch_table = _ctypes_array(SERVICE_TABLE_ENTRY, [ SERVICE_TABLE_ENTRY( service_name, cb ), SERVICE_TABLE_ENTRY(None, ctypes.cast(None, START_CALLBACK)) ]) if not advapi32.StartServiceCtrlDispatcherW(dispatch_table): raise OSError('ctypes start failed: %s' % ctypes.FormatError()) except Exception as e: tb = traceback.format_exc() msg = str(e) + '\n' + tb win_service_report_event(service_name, msg, is_error=True) raise def main(args=None): parser = argparse.ArgumentParser() parser.add_argument('-i', '--install', action='store_const', dest='action', const='install', help='Launch at Windows startup') parser.add_argument('-u', '--uninstall', action='store_const', dest='action', const='uninstall', help='Remove Windows service') parser.add_argument('-s', '--service', action='store_const', dest='action', const='service', help='Run as a Windows service') parser.add_argument('-b', '--bind', metavar='', action='store', default='localhost:8142', help='Bind to host:port (default %default)') options = parser.parse_args(args=args) if options.action == 'install': fn = os.path.abspath(__file__).replace('v:', '\\\\vboxsrv\\vbox') cmdline = '%s %s -s -b %s' % (sys.executable, fn, options.bind) win_install_service(SVCNAME, cmdline) return if options.action == 'uninstall': win_uninstall_service(SVCNAME) return if options.action == 'service': win_service_start(SVCNAME, main) return host, port_str = options.bind.split(':') port = int(port_str) print('Listening on %s:%d' % (host, port)) srv = BuildHTTPServer((host, port), BuildHTTPRequestHandler) thr = threading.Thread(target=srv.serve_forever) thr.start() input('Press ENTER to shut down') srv.shutdown() thr.join() def rmtree(path): for name in os.listdir(path): fname = os.path.join(path, name) if os.path.isdir(fname): rmtree(fname) else: os.chmod(fname, 0o666) os.remove(fname) os.rmdir(path) #============================================================================== class BuildError(Exception): def __init__(self, output, code=500): self.output = output self.code = code def __str__(self): return self.output class HTTPError(BuildError): pass class PythonBuilder(object): def __init__(self, **kwargs): pythonVersion = kwargs.pop('python', '2.7') try: key = _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\Python\PythonCore\%s\InstallPath' % pythonVersion) try: self.pythonPath, _ = _winreg.QueryValueEx(key, '') finally: _winreg.CloseKey(key) except Exception: raise BuildError('No such Python version: %s' % pythonVersion) 
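# Each builder class consumes its own kwargs and then delegates to
# super().__init__(), so the classes compose cooperatively: the concrete
# Builder defined below inherits from all of them, and Python's MRO chains
# every __init__()/build() step in turn, with Null terminating the chain.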
super(PythonBuilder, self).__init__(**kwargs) class GITInfoBuilder(object): def __init__(self, **kwargs): try: self.user, self.repoName = kwargs['path'][:2] self.rev = kwargs.pop('rev') except ValueError: raise BuildError('Invalid path') except KeyError as e: raise BuildError('Missing mandatory parameter "%s"' % e.args[0]) path = os.path.join(os.environ['APPDATA'], 'Build archive', self.repoName, self.user) if not os.path.exists(path): os.makedirs(path) self.basePath = tempfile.mkdtemp(dir=path) self.buildPath = os.path.join(self.basePath, 'build') super(GITInfoBuilder, self).__init__(**kwargs) class GITBuilder(GITInfoBuilder): def build(self): try: subprocess.check_output(['git', 'clone', 'git://github.com/%s/%s.git' % (self.user, self.repoName), self.buildPath]) subprocess.check_output(['git', 'checkout', self.rev], cwd=self.buildPath) except subprocess.CalledProcessError as e: raise BuildError(e.output) super(GITBuilder, self).build() class YoutubeDLBuilder(object): authorizedUsers = ['fraca7', 'phihag', 'rg3', 'FiloSottile'] def __init__(self, **kwargs): if self.repoName != 'youtube-dl': raise BuildError('Invalid repository "%s"' % self.repoName) if self.user not in self.authorizedUsers: raise HTTPError('Unauthorized user "%s"' % self.user, 401) super(YoutubeDLBuilder, self).__init__(**kwargs) def build(self): try: subprocess.check_output([os.path.join(self.pythonPath, 'python.exe'), 'setup.py', 'py2exe'], cwd=self.buildPath) except subprocess.CalledProcessError as e: raise BuildError(e.output) super(YoutubeDLBuilder, self).build() class DownloadBuilder(object): def __init__(self, **kwargs): self.handler = kwargs.pop('handler') self.srcPath = os.path.join(self.buildPath, *tuple(kwargs['path'][2:])) self.srcPath = os.path.abspath(os.path.normpath(self.srcPath)) if not self.srcPath.startswith(self.buildPath): raise HTTPError(self.srcPath, 401) super(DownloadBuilder, self).__init__(**kwargs) def build(self): if not os.path.exists(self.srcPath): raise HTTPError('No such file', 404) if os.path.isdir(self.srcPath): raise HTTPError('Is a directory: %s' % self.srcPath, 401) self.handler.send_response(200) self.handler.send_header('Content-Type', 'application/octet-stream') self.handler.send_header('Content-Disposition', 'attachment; filename=%s' % os.path.split(self.srcPath)[-1]) self.handler.send_header('Content-Length', str(os.stat(self.srcPath).st_size)) self.handler.end_headers() with open(self.srcPath, 'rb') as src: shutil.copyfileobj(src, self.handler.wfile) super(DownloadBuilder, self).build() class CleanupTempDir(object): def build(self): try: rmtree(self.basePath) except Exception as e: print('WARNING deleting "%s": %s' % (self.basePath, e)) super(CleanupTempDir, self).build() class Null(object): def __init__(self, **kwargs): pass def start(self): pass def close(self): pass def build(self): pass class Builder(PythonBuilder, GITBuilder, YoutubeDLBuilder, DownloadBuilder, CleanupTempDir, Null): pass class BuildHTTPRequestHandler(BaseHTTPRequestHandler): actionDict = {'build': Builder, 'download': Builder} # They're the same, no more caching. 
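# do_GET() routes /<action>/<path...> URLs: it looks the action up in
# actionDict, instantiates the composed Builder with the request's query
# parameters, runs start()/build()/close(), and converts a raised
# BuildError/HTTPError into the corresponding HTTP status code.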
def do_GET(self): path = urlparse.urlparse(self.path) paramDict = dict([(key, value[0]) for key, value in urlparse.parse_qs(path.query).items()]) action, _, path = path.path.strip('/').partition('/') if path: path = path.split('/') if action in self.actionDict: try: builder = self.actionDict[action](path=path, handler=self, **paramDict) builder.start() try: builder.build() finally: builder.close() except BuildError as e: self.send_response(e.code) msg = unicode(e).encode('UTF-8') self.send_header('Content-Type', 'text/plain; charset=UTF-8') self.send_header('Content-Length', len(msg)) self.end_headers() self.wfile.write(msg) except HTTPError as e: self.send_response(e.code, str(e)) else: self.send_response(500, 'Unknown build method "%s"' % action) else: self.send_response(500, 'Malformed URL') #============================================================================== if __name__ == '__main__': main() youtube-dl/devscripts/SizeOfImage.patch0000644000000000000000000000022312641030331017204 0ustar rootrootBSDIFF4023DBZh91AY&SYgmDD`@ !`ЊeH lMBZh91AY&SY> M l %rE8P>BZh9rE8Pyoutube-dl/devscripts/wine-py2exe.sh0000755000000000000000000000365112641030331016544 0ustar rootroot#!/bin/bash # Run with as parameter a setup.py that works in the current directory # e.g. no os.chdir() # It will run twice, the first time will crash set -e SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" if [ ! -d wine-py2exe ]; then sudo apt-get install wine1.3 axel bsdiff mkdir wine-py2exe cd wine-py2exe export WINEPREFIX=`pwd` axel -a "http://www.python.org/ftp/python/2.7/python-2.7.msi" axel -a "http://downloads.sourceforge.net/project/py2exe/py2exe/0.6.9/py2exe-0.6.9.win32-py2.7.exe" #axel -a "http://winetricks.org/winetricks" # http://appdb.winehq.org/objectManager.php?sClass=version&iId=21957 echo "Follow python setup on screen" wine msiexec /i python-2.7.msi echo "Follow py2exe setup on screen" wine py2exe-0.6.9.win32-py2.7.exe #echo "Follow Microsoft Visual C++ 2008 Redistributable Package setup on screen" #bash winetricks vcrun2008 rm py2exe-0.6.9.win32-py2.7.exe rm python-2.7.msi #rm winetricks # http://bugs.winehq.org/show_bug.cgi?id=3591 mv drive_c/Python27/Lib/site-packages/py2exe/run.exe drive_c/Python27/Lib/site-packages/py2exe/run.exe.backup bspatch drive_c/Python27/Lib/site-packages/py2exe/run.exe.backup drive_c/Python27/Lib/site-packages/py2exe/run.exe "$SCRIPT_DIR/SizeOfImage.patch" mv drive_c/Python27/Lib/site-packages/py2exe/run_w.exe drive_c/Python27/Lib/site-packages/py2exe/run_w.exe.backup bspatch drive_c/Python27/Lib/site-packages/py2exe/run_w.exe.backup drive_c/Python27/Lib/site-packages/py2exe/run_w.exe "$SCRIPT_DIR/SizeOfImage_w.patch" cd - else export WINEPREFIX="$( cd wine-py2exe && pwd )" fi wine "C:\\Python27\\python.exe" "$1" py2exe > "py2exe.log" 2>&1 || true echo '# Copying python27.dll' >> "py2exe.log" cp "$WINEPREFIX/drive_c/windows/system32/python27.dll" build/bdist.win32/winexe/bundle-2.7/ wine "C:\\Python27\\python.exe" "$1" py2exe >> "py2exe.log" 2>&1 youtube-dl/devscripts/make_contributing.py0000755000000000000000000000142512641030331020107 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals import io import optparse import re def main(): parser = optparse.OptionParser(usage='%prog INFILE OUTFILE') options, args = parser.parse_args() if len(args) != 2: parser.error('Expected an input and an output filename') infile, outfile = args with io.open(infile, encoding='utf-8') as inf: readme = inf.read() bug_text = re.search( 
r'(?s)#\s*BUGS\s*[^\n]*\s*(.*?)#\s*COPYRIGHT', readme).group(1) dev_text = re.search( r'(?s)(#\s*DEVELOPER INSTRUCTIONS.*?)#\s*EMBEDDING YOUTUBE-DL', readme).group(1) out = bug_text + dev_text with io.open(outfile, 'w', encoding='utf-8') as outf: outf.write(out) if __name__ == '__main__': main() youtube-dl/devscripts/zsh-completion.py0000755000000000000000000000254712641030331017364 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals import os from os.path import dirname as dirn import sys sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) import youtube_dl ZSH_COMPLETION_FILE = "youtube-dl.zsh" ZSH_COMPLETION_TEMPLATE = "devscripts/zsh-completion.in" def build_completion(opt_parser): opts = [opt for group in opt_parser.option_groups for opt in group.option_list] opts_file = [opt for opt in opts if opt.metavar == "FILE"] opts_dir = [opt for opt in opts if opt.metavar == "DIR"] fileopts = [] for opt in opts_file: if opt._short_opts: fileopts.extend(opt._short_opts) if opt._long_opts: fileopts.extend(opt._long_opts) diropts = [] for opt in opts_dir: if opt._short_opts: diropts.extend(opt._short_opts) if opt._long_opts: diropts.extend(opt._long_opts) flags = [opt.get_opt_string() for opt in opts] with open(ZSH_COMPLETION_TEMPLATE) as f: template = f.read() template = template.replace("{{fileopts}}", "|".join(fileopts)) template = template.replace("{{diropts}}", "|".join(diropts)) template = template.replace("{{flags}}", " ".join(flags)) with open(ZSH_COMPLETION_FILE, "w") as f: f.write(template) parser = youtube_dl.parseOpts()[0] build_completion(parser) youtube-dl/devscripts/fish-completion.in0000644000000000000000000000020212641030331017446 0ustar rootroot {{commands}} complete --command youtube-dl --arguments ":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" youtube-dl/devscripts/posix-locale.sh0000755000000000000000000000026412641030331016764 0ustar rootroot # source this file in your shell to get a POSIX locale (which will break many programs, but that's kind of the point) export LC_ALL=POSIX export LANG=POSIX export LANGUAGE=POSIX youtube-dl/devscripts/prepare_manpage.py0000644000000000000000000000317012641030331017525 0ustar rootrootfrom __future__ import unicode_literals import io import os.path import sys import re ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) README_FILE = os.path.join(ROOT_DIR, 'README.md') def filter_options(readme): ret = '' in_options = False for line in readme.split('\n'): if line.startswith('# '): if line[2:].startswith('OPTIONS'): in_options = True else: in_options = False if in_options: if line.lstrip().startswith('-'): option, description = re.split(r'\s{2,}', line.lstrip()) split_option = option.split(' ') if not split_option[-1].startswith('-'): # metavar option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) # Pandoc's definition_lists. See http://pandoc.org/README.html # for more information. ret += '\n%s\n: %s\n' % (option, description) else: ret += line.lstrip() + '\n' else: ret += line + '\n' return ret with io.open(README_FILE, encoding='utf-8') as f: readme = f.read() PREFIX = '''%YOUTUBE-DL(1) # NAME youtube\-dl \- download videos from youtube.com or other video platforms # SYNOPSIS **youtube-dl** \[OPTIONS\] URL [URL...] 
''' readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme) readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme) readme = PREFIX + readme readme = filter_options(readme) if sys.version_info < (3, 0): print(readme.encode('utf-8')) else: print(readme) youtube-dl/devscripts/SizeOfImage_w.patch0000644000000000000000000000022412641030331017533 0ustar rootrootBSDIFF4024DBZh91AY&SYk.DH`@ !`ЊeH  te`BZh91AY&SY֤? 0R~ovrE8P֤?BZh9rE8Pyoutube-dl/devscripts/check-porn.py0000644000000000000000000000360512641030331016433 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals """ This script employs a VERY basic heuristic ('porn' in webpage.lower()) to check if we are not 'age_limit' tagging some porn site A second approach implemented relies on a list of porn domains, to activate it pass the list filename as the only argument """ # Allow direct execution import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import get_testcases from youtube_dl.utils import compat_urllib_parse_urlparse from youtube_dl.utils import compat_urllib_request if len(sys.argv) > 1: METHOD = 'LIST' LIST = open(sys.argv[1]).read().decode('utf8').strip() else: METHOD = 'EURISTIC' for test in get_testcases(): if METHOD == 'EURISTIC': try: webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() except Exception: print('\nFail: {0}'.format(test['name'])) continue webpage = webpage.decode('utf8', 'replace') RESULT = 'porn' in webpage.lower() elif METHOD == 'LIST': domain = compat_urllib_parse_urlparse(test['url']).netloc if not domain: print('\nFail: {0}'.format(test['name'])) continue domain = '.'.join(domain.split('.')[-2:]) RESULT = ('.' + domain + '\n' in LIST or '\n' + domain + '\n' in LIST) if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] or test['info_dict']['age_limit'] != 18): print('\nPotential missing age_limit check: {0}'.format(test['name'])) elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] and test['info_dict']['age_limit'] == 18): print('\nPotential false negative: {0}'.format(test['name'])) else: sys.stdout.write('.') sys.stdout.flush() print() youtube-dl/devscripts/bash-completion.in0000644000000000000000000000151412641030331017441 0ustar rootroot__youtube_dl() { local cur prev opts fileopts diropts keywords COMPREPLY=() cur="${COMP_WORDS[COMP_CWORD]}" prev="${COMP_WORDS[COMP_CWORD-1]}" opts="{{flags}}" keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" fileopts="-a|--batch-file|--download-archive|--cookies|--load-info" diropts="--cache-dir" if [[ ${prev} =~ ${fileopts} ]]; then COMPREPLY=( $(compgen -f -- ${cur}) ) return 0 elif [[ ${prev} =~ ${diropts} ]]; then COMPREPLY=( $(compgen -d -- ${cur}) ) return 0 fi if [[ ${cur} =~ : ]]; then COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) ) return 0 elif [[ ${cur} == * ]] ; then COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) ) return 0 fi } complete -F __youtube_dl youtube-dl youtube-dl/devscripts/bash-completion.py0000755000000000000000000000154412641030331017471 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals import os from os.path import dirname as dirn import sys sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) import youtube_dl BASH_COMPLETION_FILE = "youtube-dl.bash-completion" BASH_COMPLETION_TEMPLATE = "devscripts/bash-completion.in" def build_completion(opt_parser): opts_flag = [] for group in 
opt_parser.option_groups: for option in group.option_list: # for every long flag opts_flag.append(option.get_opt_string()) with open(BASH_COMPLETION_TEMPLATE) as f: template = f.read() with open(BASH_COMPLETION_FILE, "w") as f: # just using the special char filled_template = template.replace("{{flags}}", " ".join(opts_flag)) f.write(filled_template) parser = youtube_dl.parseOpts()[0] build_completion(parser) youtube-dl/devscripts/generate_aes_testdata.py0000644000000000000000000000216012641030331020710 0ustar rootrootfrom __future__ import unicode_literals import codecs import subprocess import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.utils import intlist_to_bytes from youtube_dl.aes import aes_encrypt, key_expansion secret_msg = b'Secret message goes here' def hex_str(int_list): return codecs.encode(intlist_to_bytes(int_list), 'hex') def openssl_encode(algo, key, iv): cmd = ['openssl', 'enc', '-e', '-' + algo, '-K', hex_str(key), '-iv', hex_str(iv)] prog = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) out, _ = prog.communicate(secret_msg) return out iv = key = [0x20, 0x15] + 14 * [0] r = openssl_encode('aes-128-cbc', key, iv) print('aes_cbc_decrypt') print(repr(r)) password = key new_key = aes_encrypt(password, key_expansion(password)) r = openssl_encode('aes-128-ctr', new_key, iv) print('aes_decrypt_text 16') print(repr(r)) password = key + 16 * [0] new_key = aes_encrypt(password, key_expansion(password)) * (32 // 16) r = openssl_encode('aes-256-ctr', new_key, iv) print('aes_decrypt_text 32') print(repr(r)) youtube-dl/devscripts/gh-pages/0000755000000000000000000000000012645665720015543 5ustar rootrootyoutube-dl/devscripts/gh-pages/update-feed.py0000755000000000000000000000442312641030331020262 0ustar rootroot#!/usr/bin/env python3 from __future__ import unicode_literals import datetime import io import json import textwrap atom_template = textwrap.dedent("""\ youtube-dl releases https://yt-dl.org/feed/youtube-dl-updates-feed @TIMESTAMP@ @ENTRIES@ """) entry_template = textwrap.dedent(""" https://yt-dl.org/feed/youtube-dl-updates-feed/youtube-dl-@VERSION@ New version @VERSION@
Downloads available at <a href="https://yt-dl.org/downloads/@VERSION@/">https://yt-dl.org/downloads/@VERSION@/</a>
<author><name>The youtube-dl maintainers</name></author> <updated>@TIMESTAMP@</updated>
""") now = datetime.datetime.now() now_iso = now.isoformat() + 'Z' atom_template = atom_template.replace('@TIMESTAMP@', now_iso) versions_info = json.load(open('update/versions.json')) versions = list(versions_info['versions'].keys()) versions.sort() entries = [] for v in versions: fields = v.split('.') year, month, day = map(int, fields[:3]) faked = 0 patchlevel = 0 while True: try: datetime.date(year, month, day) except ValueError: day -= 1 faked += 1 assert day > 0 continue break if len(fields) >= 4: try: patchlevel = int(fields[3]) except ValueError: patchlevel = 1 timestamp = '%04d-%02d-%02dT00:%02d:%02dZ' % (year, month, day, faked, patchlevel) entry = entry_template.replace('@TIMESTAMP@', timestamp) entry = entry.replace('@VERSION@', v) entries.append(entry) entries_str = textwrap.indent(''.join(entries), '\t') atom_template = atom_template.replace('@ENTRIES@', entries_str) with io.open('update/releases.atom', 'w', encoding='utf-8') as atom_file: atom_file.write(atom_template) youtube-dl/devscripts/gh-pages/update-sites.py0000755000000000000000000000202112641030331020476 0ustar rootroot#!/usr/bin/env python3 from __future__ import unicode_literals import sys import os import textwrap # We must be able to import youtube_dl sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) import youtube_dl def main(): with open('supportedsites.html.in', 'r', encoding='utf-8') as tmplf: template = tmplf.read() ie_htmls = [] for ie in youtube_dl.list_extractors(age_limit=None): ie_html = '{}'.format(ie.IE_NAME) ie_desc = getattr(ie, 'IE_DESC', None) if ie_desc is False: continue elif ie_desc is not None: ie_html += ': {}'.format(ie.IE_DESC) if not ie.working(): ie_html += ' (Currently broken)' ie_htmls.append('
<li>{}</li>
  • '.format(ie_html)) template = template.replace('@SITES@', textwrap.indent('\n'.join(ie_htmls), '\t')) with open('supportedsites.html', 'w', encoding='utf-8') as sitesf: sitesf.write(template) if __name__ == '__main__': main() youtube-dl/devscripts/gh-pages/add-version.py0000755000000000000000000000222212641030331020305 0ustar rootroot#!/usr/bin/env python3 from __future__ import unicode_literals import json import sys import hashlib import os.path if len(sys.argv) <= 1: print('Specify the version number as parameter') sys.exit() version = sys.argv[1] with open('update/LATEST_VERSION', 'w') as f: f.write(version) versions_info = json.load(open('update/versions.json')) if 'signature' in versions_info: del versions_info['signature'] new_version = {} filenames = { 'bin': 'youtube-dl', 'exe': 'youtube-dl.exe', 'tar': 'youtube-dl-%s.tar.gz' % version} build_dir = os.path.join('..', '..', 'build', version) for key, filename in filenames.items(): url = 'https://yt-dl.org/downloads/%s/%s' % (version, filename) fn = os.path.join(build_dir, filename) with open(fn, 'rb') as f: data = f.read() if not data: raise ValueError('File %s is empty!' % fn) sha256sum = hashlib.sha256(data).hexdigest() new_version[key] = (url, sha256sum) versions_info['versions'][version] = new_version versions_info['latest'] = version with open('update/versions.json', 'w') as jsonf: json.dump(versions_info, jsonf, indent=4, sort_keys=True) youtube-dl/devscripts/gh-pages/update-copyright.py0000755000000000000000000000114612645665720021412 0ustar rootroot#!/usr/bin/env python # coding: utf-8 from __future__ import with_statement, unicode_literals import datetime import glob import io # For Python 2 compatibility import os import re year = str(datetime.datetime.now().year) for fn in glob.glob('*.html*'): with io.open(fn, encoding='utf-8') as f: content = f.read() newc = re.sub(r'(?PCopyright © 2006-)(?P[0-9]{4})', 'Copyright © 2006-' + year, content) if content != newc: tmpFn = fn + '.part' with io.open(tmpFn, 'wt', encoding='utf-8') as outf: outf.write(newc) os.rename(tmpFn, fn) youtube-dl/devscripts/gh-pages/sign-versions.py0000755000000000000000000000161512641030331020705 0ustar rootroot#!/usr/bin/env python3 from __future__ import unicode_literals, with_statement import rsa import json from binascii import hexlify try: input = raw_input except NameError: pass versions_info = json.load(open('update/versions.json')) if 'signature' in versions_info: del versions_info['signature'] print('Enter the PKCS1 private key, followed by a blank line:') privkey = b'' while True: try: line = input() except EOFError: break if line == '': break privkey += line.encode('ascii') + b'\n' privkey = rsa.PrivateKey.load_pkcs1(privkey) signature = hexlify(rsa.pkcs1.sign(json.dumps(versions_info, sort_keys=True).encode('utf-8'), privkey, 'SHA-256')).decode() print('signature: ' + signature) versions_info['signature'] = signature with open('update/versions.json', 'w') as versionsf: json.dump(versions_info, versionsf, indent=4, sort_keys=True) youtube-dl/devscripts/gh-pages/generate-download.py0000755000000000000000000000235712641030331021502 0ustar rootroot#!/usr/bin/env python3 from __future__ import unicode_literals import hashlib import urllib.request import json versions_info = json.load(open('update/versions.json')) version = versions_info['latest'] URL = versions_info['versions'][version]['bin'][0] data = urllib.request.urlopen(URL).read() # Read template page with open('download.html.in', 'r', encoding='utf-8') as tmplf: template = 
tmplf.read() md5sum = hashlib.md5(data).hexdigest() sha1sum = hashlib.sha1(data).hexdigest() sha256sum = hashlib.sha256(data).hexdigest() template = template.replace('@PROGRAM_VERSION@', version) template = template.replace('@PROGRAM_URL@', URL) template = template.replace('@PROGRAM_MD5SUM@', md5sum) template = template.replace('@PROGRAM_SHA1SUM@', sha1sum) template = template.replace('@PROGRAM_SHA256SUM@', sha256sum) template = template.replace('@EXE_URL@', versions_info['versions'][version]['exe'][0]) template = template.replace('@EXE_SHA256SUM@', versions_info['versions'][version]['exe'][1]) template = template.replace('@TAR_URL@', versions_info['versions'][version]['tar'][0]) template = template.replace('@TAR_SHA256SUM@', versions_info['versions'][version]['tar'][1]) with open('download.html', 'w', encoding='utf-8') as dlf: dlf.write(template) youtube-dl/devscripts/release.sh0000755000000000000000000000757312641030331016017 0ustar rootroot#!/bin/bash # IMPORTANT: the following assumptions are made # * the GH repo is on the origin remote # * the gh-pages branch is named so locally # * the git config user.signingkey is properly set # You will need # pip install coverage nose rsa # TODO # release notes # make hash on local files set -e skip_tests=true if [ "$1" = '--run-tests' ]; then skip_tests=false shift fi if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi version="$1" major_version=$(echo "$version" | sed -n 's#^\([0-9]*\.[0-9]*\.[0-9]*\).*#\1#p') if test "$major_version" '!=' "$(date '+%Y.%m.%d')"; then echo "$version does not start with today's date!" exit 1 fi if [ ! -z "`git tag | grep "$version"`" ]; then echo 'ERROR: version already present'; exit 1; fi if [ ! -z "`git status --porcelain | grep -v CHANGELOG`" ]; then echo 'ERROR: the working directory is not clean; commit or stash changes'; exit 1; fi useless_files=$(find youtube_dl -type f -not -name '*.py') if [ ! -z "$useless_files" ]; then echo "ERROR: Non-.py files in youtube_dl: $useless_files"; exit 1; fi if [ ! -f "updates_key.pem" ]; then echo 'ERROR: updates_key.pem missing'; exit 1; fi /bin/echo -e "\n### First of all, testing..." make clean if $skip_tests ; then echo 'SKIPPING TESTS' else nosetests --verbose --with-coverage --cover-package=youtube_dl --cover-html test --stop || exit 1 fi /bin/echo -e "\n### Changing version in version.py..." sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py /bin/echo -e "\n### Committing documentation and youtube_dl/version.py..." make README.md CONTRIBUTING.md supportedsites git add README.md CONTRIBUTING.md docs/supportedsites.md youtube_dl/version.py git commit -m "release $version" /bin/echo -e "\n### Now tagging, signing and pushing..." git tag -s -m "Release $version" "$version" git show "$version" read -p "Is it good, can I push? (y/n) " -n 1 if [[ ! $REPLY =~ ^[Yy]$ ]]; then exit 1; fi echo MASTER=$(git rev-parse --abbrev-ref HEAD) git push origin $MASTER:master git push origin "$version" /bin/echo -e "\n### OK, now it is time to build the binaries..." REV=$(git rev-parse HEAD) make youtube-dl youtube-dl.tar.gz read -p "VM running? 
(y/n) " -n 1 wget "http://localhost:8142/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe mkdir -p "build/$version" mv youtube-dl youtube-dl.exe "build/$version" mv youtube-dl.tar.gz "build/$version/youtube-dl-$version.tar.gz" RELEASE_FILES="youtube-dl youtube-dl.exe youtube-dl-$version.tar.gz" (cd build/$version/ && md5sum $RELEASE_FILES > MD5SUMS) (cd build/$version/ && sha1sum $RELEASE_FILES > SHA1SUMS) (cd build/$version/ && sha256sum $RELEASE_FILES > SHA2-256SUMS) (cd build/$version/ && sha512sum $RELEASE_FILES > SHA2-512SUMS) /bin/echo -e "\n### Signing and uploading the new binaries to yt-dl.org ..." for f in $RELEASE_FILES; do gpg --passphrase-repeat 5 --detach-sig "build/$version/$f"; done scp -r "build/$version" ytdl@yt-dl.org:html/tmp/ ssh ytdl@yt-dl.org "mv html/tmp/$version html/downloads/" ssh ytdl@yt-dl.org "sh html/update_latest.sh $version" /bin/echo -e "\n### Now switching to gh-pages..." git clone --branch gh-pages --single-branch . build/gh-pages ROOT=$(pwd) ( set -e ORIGIN_URL=$(git config --get remote.origin.url) cd build/gh-pages "$ROOT/devscripts/gh-pages/add-version.py" $version "$ROOT/devscripts/gh-pages/update-feed.py" "$ROOT/devscripts/gh-pages/sign-versions.py" < "$ROOT/updates_key.pem" "$ROOT/devscripts/gh-pages/generate-download.py" "$ROOT/devscripts/gh-pages/update-copyright.py" "$ROOT/devscripts/gh-pages/update-sites.py" git add *.html *.html.in update git commit -m "release $version" git push "$ROOT" gh-pages git push "$ORIGIN_URL" gh-pages ) rm -rf build make pypi-files echo "Uploading to PyPi ..." python setup.py sdist bdist_wheel upload make clean /bin/echo -e "\n### DONE!" youtube-dl/devscripts/make_readme.py0000755000000000000000000000122412641030331016632 0ustar rootrootfrom __future__ import unicode_literals import io import sys import re README_FILE = 'README.md' helptext = sys.stdin.read() if isinstance(helptext, bytes): helptext = helptext.decode('utf-8') with io.open(README_FILE, encoding='utf-8') as f: oldreadme = f.read() header = oldreadme[:oldreadme.index('# OPTIONS')] footer = oldreadme[oldreadme.index('# CONFIGURATION'):] options = helptext[helptext.index(' General Options:') + 19:] options = re.sub(r'(?m)^ (\w.+)$', r'## \1', options) options = '# OPTIONS\n' + options + '\n' with io.open(README_FILE, 'w', encoding='utf-8') as f: f.write(header) f.write(options) f.write(footer) youtube-dl/devscripts/fish-completion.py0000755000000000000000000000311512641030331017501 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals import optparse import os from os.path import dirname as dirn import sys sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) import youtube_dl from youtube_dl.utils import shell_quote FISH_COMPLETION_FILE = 'youtube-dl.fish' FISH_COMPLETION_TEMPLATE = 'devscripts/fish-completion.in' EXTRA_ARGS = { 'recode-video': ['--arguments', 'mp4 flv ogg webm mkv', '--exclusive'], # Options that need a file parameter 'download-archive': ['--require-parameter'], 'cookies': ['--require-parameter'], 'load-info': ['--require-parameter'], 'batch-file': ['--require-parameter'], } def build_completion(opt_parser): commands = [] for group in opt_parser.option_groups: for option in group.option_list: long_option = option.get_opt_string().strip('-') complete_cmd = ['complete', '--command', 'youtube-dl', '--long-option', long_option] if option._short_opts: complete_cmd += ['--short-option', option._short_opts[0].strip('-')] if option.help != optparse.SUPPRESS_HELP: complete_cmd += ['--description', 
option.help] complete_cmd.extend(EXTRA_ARGS.get(long_option, [])) commands.append(shell_quote(complete_cmd)) with open(FISH_COMPLETION_TEMPLATE) as f: template = f.read() filled_template = template.replace('{{commands}}', '\n'.join(commands)) with open(FISH_COMPLETION_FILE, 'w') as f: f.write(filled_template) parser = youtube_dl.parseOpts()[0] build_completion(parser) youtube-dl/devscripts/zsh-completion.in0000644000000000000000000000141112641030331017324 0ustar rootroot#compdef youtube-dl __youtube_dl() { local curcontext="$curcontext" fileopts diropts cur prev typeset -A opt_args fileopts="{{fileopts}}" diropts="{{diropts}}" cur=$words[CURRENT] case $cur in :) _arguments '*: :(::ytfavorites ::ytrecommended ::ytsubscriptions ::ytwatchlater ::ythistory)' ;; *) prev=$words[CURRENT-1] if [[ ${prev} =~ ${fileopts} ]]; then _path_files elif [[ ${prev} =~ ${diropts} ]]; then _path_files -/ elif [[ ${prev} == "--recode-video" ]]; then _arguments '*: :(mp4 flv ogg webm mkv)' else _arguments '*: :({{flags}})' fi ;; esac } __youtube_dlyoutube-dl/devscripts/make_supportedsites.py0000644000000000000000000000220012641030331020462 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals import io import optparse import os import sys # Import youtube_dl ROOT_DIR = os.path.join(os.path.dirname(__file__), '..') sys.path.insert(0, ROOT_DIR) import youtube_dl def main(): parser = optparse.OptionParser(usage='%prog OUTFILE.md') options, args = parser.parse_args() if len(args) != 1: parser.error('Expected an output filename') outfile, = args def gen_ies_md(ies): for ie in ies: ie_md = '**{0}**'.format(ie.IE_NAME) ie_desc = getattr(ie, 'IE_DESC', None) if ie_desc is False: continue if ie_desc is not None: ie_md += ': {0}'.format(ie.IE_DESC) if not ie.working(): ie_md += ' (Currently broken)' yield ie_md ies = sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()) out = '# Supported sites\n' + ''.join( ' - ' + md + '\n' for md in gen_ies_md(ies)) with io.open(outfile, 'w', encoding='utf-8') as outf: outf.write(out) if __name__ == '__main__': main() youtube-dl/test/0000755000000000000000000000000012662564617012643 5ustar rootrootyoutube-dl/test/test_unicode_literals.py0000644000000000000000000000354612641030331017563 0ustar rootrootfrom __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import io import re rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) IGNORED_FILES = [ 'setup.py', # http://bugs.python.org/issue13943 'conf.py', 'buildserver.py', ] IGNORED_DIRS = [ '.git', '.tox', ] from test.helper import assertRegexpMatches class TestUnicodeLiterals(unittest.TestCase): def test_all_files(self): for dirpath, dirnames, filenames in os.walk(rootDir): for ignore_dir in IGNORED_DIRS: if ignore_dir in dirnames: # If we remove the directory from dirnames os.walk won't # recurse into it dirnames.remove(ignore_dir) for basename in filenames: if not basename.endswith('.py'): continue if basename in IGNORED_FILES: continue fn = os.path.join(dirpath, basename) with io.open(fn, encoding='utf-8') as inf: code = inf.read() if "'" not in code and '"' not in code: continue assertRegexpMatches( self, code, r'(?:(?:#.*?|\s*)\n)*from __future__ import (?:[a-z_]+,\s*)*unicode_literals', 'unicode_literals import missing in %s' % fn) m = re.search(r'(?<=\s)u[\'"](?!\)|,|$)', code) if m is not None: self.assertTrue( m is None, 'u present in %s, around %s' 
% ( fn, code[m.start() - 10:m.end() + 10])) if __name__ == '__main__': unittest.main() youtube-dl/test/test_compat.py0000644000000000000000000000777112641030331015525 0ustar rootroot#!/usr/bin/env python # coding: utf-8 from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, compat_etree_fromstring, compat_expanduser, compat_shlex_split, compat_str, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, ) class TestCompat(unittest.TestCase): def test_compat_getenv(self): test_str = 'тест' os.environ['YOUTUBE-DL-TEST'] = ( test_str if sys.version_info >= (3, 0) else test_str.encode(get_filesystem_encoding())) self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str) def test_compat_expanduser(self): old_home = os.environ.get('HOME') test_str = 'C:\Documents and Settings\тест\Application Data' os.environ['HOME'] = ( test_str if sys.version_info >= (3, 0) else test_str.encode(get_filesystem_encoding())) self.assertEqual(compat_expanduser('~'), test_str) os.environ['HOME'] = old_home def test_all_present(self): import youtube_dl.compat all_names = youtube_dl.compat.__all__ present_names = set(filter( lambda c: '_' in c and not c.startswith('_'), dir(youtube_dl.compat))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) def test_compat_urllib_parse_unquote(self): self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') self.assertEqual(compat_urllib_parse_unquote(''), '') self.assertEqual(compat_urllib_parse_unquote('%'), '%') self.assertEqual(compat_urllib_parse_unquote('%%'), '%%') self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%') self.assertEqual(compat_urllib_parse_unquote('%2F'), '/') self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') self.assertEqual( compat_urllib_parse_unquote(''' %%a'''), ''' %%a''') self.assertEqual( compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') def test_compat_urllib_parse_unquote_plus(self): self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def') self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def') def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) def test_compat_etree_fromstring(self): xml = ''' foo 中文 spam ''' doc = compat_etree_fromstring(xml.encode('utf-8')) self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) self.assertTrue(isinstance(doc.find('normal').text, compat_str)) self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) if __name__ == '__main__': unittest.main() youtube-dl/test/test_YoutubeDL.py0000644000000000000000000006450112662061715016125 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import copy from 
test.helper import FakeYDL, assertRegexpMatches from youtube_dl import YoutubeDL from youtube_dl.compat import compat_str, compat_urllib_error from youtube_dl.extractor import YoutubeIE from youtube_dl.extractor.common import InfoExtractor from youtube_dl.postprocessor.common import PostProcessor from youtube_dl.utils import ExtractorError, match_filter_func TEST_URL = 'http://localhost/sample.mp4' class YDL(FakeYDL): def __init__(self, *args, **kwargs): super(YDL, self).__init__(*args, **kwargs) self.downloaded_info_dicts = [] self.msgs = [] def process_info(self, info_dict): self.downloaded_info_dicts.append(info_dict) def to_screen(self, msg): self.msgs.append(msg) def _make_result(formats, **kwargs): res = { 'formats': formats, 'id': 'testid', 'title': 'testttitle', 'extractor': 'testex', } res.update(**kwargs) return res class TestFormatSelection(unittest.TestCase): def test_prefer_free_formats(self): # Same resolution => download webm ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ {'ext': 'webm', 'height': 460, 'url': TEST_URL}, {'ext': 'mp4', 'height': 460, 'url': TEST_URL}, ] info_dict = _make_result(formats) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'webm') # Different resolution => download best quality (mp4) ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ {'ext': 'webm', 'height': 720, 'url': TEST_URL}, {'ext': 'mp4', 'height': 1080, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'mp4') # No prefer_free_formats => prefer mp4 and flv for greater compatibility ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ {'ext': 'webm', 'height': 720, 'url': TEST_URL}, {'ext': 'mp4', 'height': 720, 'url': TEST_URL}, {'ext': 'flv', 'height': 720, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'mp4') ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ {'ext': 'flv', 'height': 720, 'url': TEST_URL}, {'ext': 'webm', 'height': 720, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'flv') def test_format_selection(self): formats = [ {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, {'format_id': 'example-with-dashes', 'ext': 'webm', 'preference': 1, 'url': TEST_URL}, {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': TEST_URL}, {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': TEST_URL}, {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': TEST_URL}, ] info_dict = _make_result(formats) ydl = YDL({'format': '20/47'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '47') ydl = YDL({'format': '20/71/worst'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '35') ydl = YDL() ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '2') ydl = 
YDL({'format': 'webm/mp4'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '47') ydl = YDL({'format': '3gp/40/mp4'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '35') ydl = YDL({'format': 'example-with-dashes'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'example-with-dashes') def test_format_selection_audio(self): formats = [ {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL}, {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': TEST_URL}, {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': TEST_URL}, ] info_dict = _make_result(formats) ydl = YDL({'format': 'bestaudio'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'audio-high') ydl = YDL({'format': 'worstaudio'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'audio-low') formats = [ {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': TEST_URL}, ] info_dict = _make_result(formats) ydl = YDL({'format': 'bestaudio/worstaudio/best'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-high') def test_format_selection_audio_exts(self): formats = [ {'format_id': 'mp3-64', 'ext': 'mp3', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'}, {'format_id': 'ogg-64', 'ext': 'ogg', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'}, {'format_id': 'aac-64', 'ext': 'aac', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'}, {'format_id': 'mp3-32', 'ext': 'mp3', 'abr': 32, 'url': 'http://_', 'vcodec': 'none'}, {'format_id': 'aac-32', 'ext': 'aac', 'abr': 32, 'url': 'http://_', 'vcodec': 'none'}, ] info_dict = _make_result(formats) ydl = YDL({'format': 'best'}) ie = YoutubeIE(ydl) ie._sort_formats(info_dict['formats']) ydl.process_ie_result(copy.deepcopy(info_dict)) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'aac-64') ydl = YDL({'format': 'mp3'}) ie = YoutubeIE(ydl) ie._sort_formats(info_dict['formats']) ydl.process_ie_result(copy.deepcopy(info_dict)) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'mp3-64') ydl = YDL({'prefer_free_formats': True}) ie = YoutubeIE(ydl) ie._sort_formats(info_dict['formats']) ydl.process_ie_result(copy.deepcopy(info_dict)) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'ogg-64') def test_format_selection_video(self): formats = [ {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': TEST_URL}, {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': TEST_URL}, {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': TEST_URL}, ] info_dict = _make_result(formats) ydl = YDL({'format': 'bestvideo'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'dash-video-high') ydl = YDL({'format': 'worstvideo'}) ydl.process_ie_result(info_dict.copy()) downloaded = 
ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'dash-video-low') formats = [ {'format_id': 'vid-vcodec-dot', 'ext': 'mp4', 'preference': 1, 'vcodec': 'avc1.123456', 'acodec': 'none', 'url': TEST_URL}, ] info_dict = _make_result(formats) ydl = YDL({'format': 'bestvideo[vcodec=avc1.123456]'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') def test_youtube_format_selection(self): order = [ '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13', # Apple HTTP Live Streaming '96', '95', '94', '93', '92', '132', '151', # 3D '85', '84', '102', '83', '101', '82', '100', # Dash video '137', '248', '136', '247', '135', '246', '245', '244', '134', '243', '133', '242', '160', # Dash audio '141', '172', '140', '171', '139', ] def format_info(f_id): info = YoutubeIE._formats[f_id].copy() # XXX: In real cases InfoExtractor._parse_mpd_formats() fills up 'acodec' # and 'vcodec', while in tests such information is incomplete since # commit a6c2c24479e5f4827ceb06f64d855329c0a6f593 # test_YoutubeDL.test_youtube_format_selection is broken without # this fix if 'acodec' in info and 'vcodec' not in info: info['vcodec'] = 'none' elif 'vcodec' in info and 'acodec' not in info: info['acodec'] = 'none' info['format_id'] = f_id info['url'] = 'url:' + f_id return info formats_order = [format_info(f_id) for f_id in order] info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': 'bestvideo+bestaudio'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '137+141') self.assertEqual(downloaded['ext'], 'mp4') info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': 'bestvideo[height>=999999]+bestaudio/best'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '38') info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': 'bestvideo/best,bestaudio'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] self.assertEqual(downloaded_ids, ['137', '141']) info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] self.assertEqual(downloaded_ids, ['137+141', '248+141']) info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])[height<=720]+bestaudio'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] self.assertEqual(downloaded_ids, ['136+141', '247+141']) info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': '(bestvideo[ext=none]/bestvideo[ext=webm])+bestaudio'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] 
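# 'bestvideo[ext=none]' matches no format, so the parenthesized group falls
# back to 'bestvideo[ext=webm]' (format 248), which is then merged with
# bestaudio (format 141).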
self.assertEqual(downloaded_ids, ['248+141']) for f1, f2 in zip(formats_order, formats_order[1:]): info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1['format_id']) info_dict = _make_result([f2, f1], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1['format_id']) def test_invalid_format_specs(self): def assert_syntax_error(format_spec): ydl = YDL({'format': format_spec}) info_dict = _make_result([{'format_id': 'foo', 'url': TEST_URL}]) self.assertRaises(SyntaxError, ydl.process_ie_result, info_dict) assert_syntax_error('bestvideo,,best') assert_syntax_error('+bestaudio') assert_syntax_error('bestvideo+') assert_syntax_error('/') def test_format_filtering(self): formats = [ {'format_id': 'A', 'filesize': 500, 'width': 1000}, {'format_id': 'B', 'filesize': 1000, 'width': 500}, {'format_id': 'C', 'filesize': 1000, 'width': 400}, {'format_id': 'D', 'filesize': 2000, 'width': 600}, {'format_id': 'E', 'filesize': 3000}, {'format_id': 'F'}, {'format_id': 'G', 'filesize': 1000000}, ] for f in formats: f['url'] = 'http://_/' f['ext'] = 'unknown' info_dict = _make_result(formats) ydl = YDL({'format': 'best[filesize<3000]'}) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'D') ydl = YDL({'format': 'best[filesize<=3000]'}) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'E') ydl = YDL({'format': 'best[filesize <= ? 
3000]'}) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'F') ydl = YDL({'format': 'best [filesize = 1000] [width>450]'}) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'B') ydl = YDL({'format': 'best [filesize = 1000] [width!=450]'}) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'C') ydl = YDL({'format': '[filesize>?1]'}) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') ydl = YDL({'format': '[filesize<1M]'}) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'E') ydl = YDL({'format': '[filesize<1MiB]'}) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') ydl = YDL({'format': 'all[width>=400][width<=600]'}) ydl.process_ie_result(info_dict) downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] self.assertEqual(downloaded_ids, ['B', 'C', 'D']) ydl = YDL({'format': 'best[height<40]'}) try: ydl.process_ie_result(info_dict) except ExtractorError: pass self.assertEqual(ydl.downloaded_info_dicts, []) class TestYoutubeDL(unittest.TestCase): def test_subtitles(self): def s_formats(lang, autocaption=False): return [{ 'ext': ext, 'url': 'http://localhost/video.%s.%s' % (lang, ext), '_auto': autocaption, } for ext in ['vtt', 'srt', 'ass']] subtitles = dict((l, s_formats(l)) for l in ['en', 'fr', 'es']) auto_captions = dict((l, s_formats(l, True)) for l in ['it', 'pt', 'es']) info_dict = { 'id': 'test', 'title': 'Test', 'url': 'http://localhost/video.mp4', 'subtitles': subtitles, 'automatic_captions': auto_captions, 'extractor': 'TEST', } def get_info(params={}): params.setdefault('simulate', True) ydl = YDL(params) ydl.report_warning = lambda *args, **kargs: None return ydl.process_video_result(info_dict, download=False) result = get_info() self.assertFalse(result.get('requested_subtitles')) self.assertEqual(result['subtitles'], subtitles) self.assertEqual(result['automatic_captions'], auto_captions) result = get_info({'writesubtitles': True}) subs = result['requested_subtitles'] self.assertTrue(subs) self.assertEqual(set(subs.keys()), set(['en'])) self.assertTrue(subs['en'].get('data') is None) self.assertEqual(subs['en']['ext'], 'ass') result = get_info({'writesubtitles': True, 'subtitlesformat': 'foo/srt'}) subs = result['requested_subtitles'] self.assertEqual(subs['en']['ext'], 'srt') result = get_info({'writesubtitles': True, 'subtitleslangs': ['es', 'fr', 'it']}) subs = result['requested_subtitles'] self.assertTrue(subs) self.assertEqual(set(subs.keys()), set(['es', 'fr'])) result = get_info({'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) subs = result['requested_subtitles'] self.assertTrue(subs) self.assertEqual(set(subs.keys()), set(['es', 'pt'])) self.assertFalse(subs['es']['_auto']) self.assertTrue(subs['pt']['_auto']) result = get_info({'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) subs = result['requested_subtitles'] self.assertTrue(subs) self.assertEqual(set(subs.keys()), set(['es', 'pt'])) self.assertTrue(subs['es']['_auto']) self.assertTrue(subs['pt']['_auto']) def test_add_extra_info(self): test_dict = { 'extractor': 'Foo', } extra_info = { 'extractor': 'Bar', 'playlist': 'funny videos', } 
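# add_extra_info() must only fill in missing fields: the existing
# 'extractor' key keeps its value ('Foo') while the new 'playlist' key
# is added.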
YDL.add_extra_info(test_dict, extra_info) self.assertEqual(test_dict['extractor'], 'Foo') self.assertEqual(test_dict['playlist'], 'funny videos') def test_prepare_filename(self): info = { 'id': '1234', 'ext': 'mp4', 'width': None, } def fname(templ): ydl = YoutubeDL({'outtmpl': templ}) return ydl.prepare_filename(info) self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4') self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4') # Replace missing fields with 'NA' self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') def test_format_note(self): ydl = YoutubeDL() self.assertEqual(ydl._format_note({}), '') assertRegexpMatches(self, ydl._format_note({ 'vbr': 10, }), '^\s*10k$') def test_postprocessors(self): filename = 'post-processor-testfile.mp4' audiofile = filename + '.mp3' class SimplePP(PostProcessor): def run(self, info): with open(audiofile, 'wt') as f: f.write('EXAMPLE') return [info['filepath']], info def run_pp(params, PP): with open(filename, 'wt') as f: f.write('EXAMPLE') ydl = YoutubeDL(params) ydl.add_post_processor(PP()) ydl.post_process(filename, {'filepath': filename}) run_pp({'keepvideo': True}, SimplePP) self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename) self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) os.unlink(filename) os.unlink(audiofile) run_pp({'keepvideo': False}, SimplePP) self.assertFalse(os.path.exists(filename), '%s exists' % filename) self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) os.unlink(audiofile) class ModifierPP(PostProcessor): def run(self, info): with open(info['filepath'], 'wt') as f: f.write('MODIFIED') return [], info run_pp({'keepvideo': False}, ModifierPP) self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename) os.unlink(filename) def test_match_filter(self): class FilterYDL(YDL): def __init__(self, *args, **kwargs): super(FilterYDL, self).__init__(*args, **kwargs) self.params['simulate'] = True def process_info(self, info_dict): super(YDL, self).process_info(info_dict) def _match_entry(self, info_dict, incomplete): res = super(FilterYDL, self)._match_entry(info_dict, incomplete) if res is None: self.downloaded_info_dicts.append(info_dict) return res first = { 'id': '1', 'url': TEST_URL, 'title': 'one', 'extractor': 'TEST', 'duration': 30, 'filesize': 10 * 1024, } second = { 'id': '2', 'url': TEST_URL, 'title': 'two', 'extractor': 'TEST', 'duration': 10, 'description': 'foo', 'filesize': 5 * 1024, } videos = [first, second] def get_videos(filter_=None): ydl = FilterYDL({'match_filter': filter_}) for v in videos: ydl.process_ie_result(v, download=True) return [v['id'] for v in ydl.downloaded_info_dicts] res = get_videos() self.assertEqual(res, ['1', '2']) def f(v): if v['id'] == '1': return None else: return 'Video id is not 1' res = get_videos(f) self.assertEqual(res, ['1']) f = match_filter_func('duration < 30') res = get_videos(f) self.assertEqual(res, ['2']) f = match_filter_func('description = foo') res = get_videos(f) self.assertEqual(res, ['2']) f = match_filter_func('description =? 
foo') res = get_videos(f) self.assertEqual(res, ['1', '2']) f = match_filter_func('filesize > 5KiB') res = get_videos(f) self.assertEqual(res, ['1']) def test_playlist_items_selection(self): entries = [{ 'id': compat_str(i), 'title': compat_str(i), 'url': TEST_URL, } for i in range(1, 5)] playlist = { '_type': 'playlist', 'id': 'test', 'entries': entries, 'extractor': 'test:playlist', 'extractor_key': 'test:playlist', 'webpage_url': 'http://example.com', } def get_ids(params): ydl = YDL(params) # make a copy because the dictionary can be modified ydl.process_ie_result(playlist.copy()) return [int(v['id']) for v in ydl.downloaded_info_dicts] result = get_ids({}) self.assertEqual(result, [1, 2, 3, 4]) result = get_ids({'playlistend': 10}) self.assertEqual(result, [1, 2, 3, 4]) result = get_ids({'playlistend': 2}) self.assertEqual(result, [1, 2]) result = get_ids({'playliststart': 10}) self.assertEqual(result, []) result = get_ids({'playliststart': 2}) self.assertEqual(result, [2, 3, 4]) result = get_ids({'playlist_items': '2-4'}) self.assertEqual(result, [2, 3, 4]) result = get_ids({'playlist_items': '2,4'}) self.assertEqual(result, [2, 4]) result = get_ids({'playlist_items': '10'}) self.assertEqual(result, []) def test_urlopen_no_file_protocol(self): # see https://github.com/rg3/youtube-dl/issues/8227 ydl = YDL() self.assertRaises(compat_urllib_error.URLError, ydl.urlopen, 'file:///etc/passwd') def test_do_not_override_ie_key_in_url_transparent(self): ydl = YDL() class Foo1IE(InfoExtractor): _VALID_URL = r'foo1:' def _real_extract(self, url): return { '_type': 'url_transparent', 'url': 'foo2:', 'ie_key': 'Foo2', } class Foo2IE(InfoExtractor): _VALID_URL = r'foo2:' def _real_extract(self, url): return { '_type': 'url', 'url': 'foo3:', 'ie_key': 'Foo3', } class Foo3IE(InfoExtractor): _VALID_URL = r'foo3:' def _real_extract(self, url): return _make_result([{'url': TEST_URL}]) ydl.add_info_extractor(Foo1IE(ydl)) ydl.add_info_extractor(Foo2IE(ydl)) ydl.add_info_extractor(Foo3IE(ydl)) ydl.extract_info('foo1:') downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['url'], TEST_URL) if __name__ == '__main__': unittest.main() youtube-dl/test/test_cache.py0000644000000000000000000000304712641030331015275 0ustar rootroot#!/usr/bin/env python # coding: utf-8 from __future__ import unicode_literals import shutil # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.cache import Cache def _is_empty(d): return not bool(os.listdir(d)) def _mkdir(d): if not os.path.exists(d): os.mkdir(d) class TestCache(unittest.TestCase): def setUp(self): TEST_DIR = os.path.dirname(os.path.abspath(__file__)) TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata') _mkdir(TESTDATA_DIR) self.test_dir = os.path.join(TESTDATA_DIR, 'cache_test') self.tearDown() def tearDown(self): if os.path.exists(self.test_dir): shutil.rmtree(self.test_dir) def test_cache(self): ydl = FakeYDL({ 'cachedir': self.test_dir, }) c = Cache(ydl) obj = {'x': 1, 'y': ['ä', '\\a', True]} self.assertEqual(c.load('test_cache', 'k.'), None) c.store('test_cache', 'k.', obj) self.assertEqual(c.load('test_cache', 'k2'), None) self.assertFalse(_is_empty(self.test_dir)) self.assertEqual(c.load('test_cache', 'k.'), obj) self.assertEqual(c.load('test_cache', 'y'), None) self.assertEqual(c.load('test_cache2', 'k.'), None) c.remove() self.assertFalse(os.path.exists(self.test_dir)) self.assertEqual(c.load('test_cache', 
'k.'), None) if __name__ == '__main__': unittest.main()
youtube-dl/test/test_download.py0000644000000000000000000002073112641030331016040 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import ( assertGreaterEqual, expect_warnings, get_params, gettestcases, expect_info_dict, try_rm, report_warning, ) import hashlib import io import json import socket import youtube_dl.YoutubeDL from youtube_dl.compat import ( compat_http_client, compat_urllib_error, compat_HTTPError, ) from youtube_dl.utils import ( DownloadError, ExtractorError, format_bytes, UnavailableVideoError, ) from youtube_dl.extractor import get_info_extractor RETRIES = 3 class YoutubeDL(youtube_dl.YoutubeDL): def __init__(self, *args, **kwargs): self.to_stderr = self.to_screen self.processed_info_dicts = [] super(YoutubeDL, self).__init__(*args, **kwargs) def report_warning(self, message): # Don't accept warnings during tests raise ExtractorError(message) def process_info(self, info_dict): self.processed_info_dicts.append(info_dict) return super(YoutubeDL, self).process_info(info_dict) def _file_md5(fn): with open(fn, 'rb') as f: return hashlib.md5(f.read()).hexdigest() defs = gettestcases() class TestDownload(unittest.TestCase): maxDiff = None def setUp(self): self.defs = defs # Dynamically generate tests def generator(test_case): def test_template(self): ie = youtube_dl.extractor.get_info_extractor(test_case['name']) other_ies = [get_info_extractor(ie_key) for ie_key in test_case.get('add_ie', [])] is_playlist = any(k.startswith('playlist') for k in test_case) test_cases = test_case.get( 'playlist', [] if is_playlist else [test_case]) def print_skipping(reason): print('Skipping %s: %s' % (test_case['name'], reason)) if not ie.working(): print_skipping('IE marked as not _WORKING') return for tc in test_cases: info_dict = tc.get('info_dict', {}) if not (info_dict.get('id') and info_dict.get('ext')): raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?') if 'skip' in test_case: print_skipping(test_case['skip']) return for other_ie in other_ies: if not other_ie.working(): print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key()) return params = get_params(test_case.get('params', {})) if is_playlist and 'playlist' not in test_case: params.setdefault('extract_flat', 'in_playlist') params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) ydl.add_default_info_extractors() finished_hook_called = set() def _hook(status): if status['status'] == 'finished': finished_hook_called.add(status['filename']) ydl.add_progress_hook(_hook) expect_warnings(ydl, test_case.get('expected_warnings', [])) def get_tc_filename(tc): return ydl.prepare_filename(tc.get('info_dict', {})) res_dict = None def try_rm_tcs_files(tcs=None): if tcs is None: tcs = test_cases for tc in tcs: tc_filename = get_tc_filename(tc) try_rm(tc_filename) try_rm(tc_filename + '.part') try_rm(os.path.splitext(tc_filename)[0] + '.info.json') try_rm_tcs_files() try: try_num = 1 while True: try: # We're not using .download here since that is just a shim # for outside error handling, and returns the exit code # instead of the result dict. 
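# extract_info() returns the final info dict (or playlist result) that the
# assertions below inspect; the enclosing while loop retries only what look
# like transient network failures, giving up after RETRIES attempts.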
res_dict = ydl.extract_info( test_case['url'], force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): raise if try_num == RETRIES: report_warning('Failed due to network errors, skipping...') return print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num)) try_num += 1 else: break if is_playlist: self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video']) self.assertTrue('entries' in res_dict) expect_info_dict(self, res_dict, test_case.get('info_dict', {})) if 'playlist_mincount' in test_case: assertGreaterEqual( self, len(res_dict['entries']), test_case['playlist_mincount'], 'Expected at least %d in playlist %s, but got only %d' % ( test_case['playlist_mincount'], test_case['url'], len(res_dict['entries']))) if 'playlist_count' in test_case: self.assertEqual( len(res_dict['entries']), test_case['playlist_count'], 'Expected %d entries in playlist %s, but got %d.' % ( test_case['playlist_count'], test_case['url'], len(res_dict['entries']), )) if 'playlist_duration_sum' in test_case: got_duration = sum(e['duration'] for e in res_dict['entries']) self.assertEqual( test_case['playlist_duration_sum'], got_duration) for tc in test_cases: tc_filename = get_tc_filename(tc) if not test_case.get('params', {}).get('skip_download', False): self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename) self.assertTrue(tc_filename in finished_hook_called) expected_minsize = tc.get('file_minsize', 10000) if expected_minsize is not None: if params.get('test'): expected_minsize = max(expected_minsize, 10000) got_fsize = os.path.getsize(tc_filename) assertGreaterEqual( self, got_fsize, expected_minsize, 'Expected %s to be at least %s, but it\'s only %s ' % (tc_filename, format_bytes(expected_minsize), format_bytes(got_fsize))) if 'md5' in tc: md5_for_file = _file_md5(tc_filename) self.assertEqual(md5_for_file, tc['md5']) info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json' self.assertTrue( os.path.exists(info_json_fn), 'Missing info file %s' % info_json_fn) with io.open(info_json_fn, encoding='utf-8') as infof: info_dict = json.load(infof) expect_info_dict(self, info_dict, tc.get('info_dict', {})) finally: try_rm_tcs_files() if is_playlist and res_dict is not None and res_dict.get('entries'): # Remove all other files that may have been extracted if the # extractor returns full results even with extract_flat res_tcs = [{'info_dict': e} for e in res_dict['entries']] try_rm_tcs_files(res_tcs) return test_template # And add them to TestDownload for n, test_case in enumerate(defs): test_method = generator(test_case) tname = 'test_' + str(test_case['name']) i = 1 while hasattr(TestDownload, tname): tname = 'test_%s_%d' % (test_case['name'], i) i += 1 test_method.__name__ = str(tname) setattr(TestDownload, test_method.__name__, test_method) del test_method if __name__ == '__main__': unittest.main() youtube-dl/test/test_youtube_lists.py0000644000000000000000000000474312653633132017163 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.extractor 
import ( YoutubePlaylistIE, YoutubeIE, ) class TestYoutubeLists(unittest.TestCase): def assertIsPlaylist(self, info): """Make sure the info has '_type' set to 'playlist'""" self.assertEqual(info['_type'], 'playlist') def test_youtube_playlist_noplaylist(self): dl = FakeYDL() dl.params['noplaylist'] = True ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') self.assertEqual(result['_type'], 'url') self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg') def test_youtube_course(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) # TODO find a > 100 (paginating?) videos course result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') entries = list(result['entries']) self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs') self.assertEqual(len(entries), 25) self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0') def test_youtube_mix(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w') entries = result['entries'] self.assertTrue(len(entries) >= 20) original_video = entries[0] self.assertEqual(original_video['id'], 'OQpdSVF_k_w') def test_youtube_toptracks(self): print('Skipping: The playlist page gives error 500') return dl = FakeYDL() ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/playlist?list=MCUS') entries = result['entries'] self.assertEqual(len(entries), 100) def test_youtube_flat_playlist_titles(self): dl = FakeYDL() dl.params['extract_flat'] = True ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') self.assertIsPlaylist(result) for entry in result['entries']: self.assertTrue(entry.get('title')) if __name__ == '__main__': unittest.main() youtube-dl/test/test_youtube_signature.py0000644000000000000000000000767012641030331020015 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import io import re import string from test.helper import FakeYDL from youtube_dl.extractor import YoutubeIE from youtube_dl.compat import compat_str, compat_urlretrieve _TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', 'js', 86, '>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', 'js', 85, '3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfle-mVwz.js', 'js', 90, ']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl0Cbn9e.js', 'js', 84, 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVW@YZ!"#$%&\'()*+,-./:;<=', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'js', '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', 'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', 'js', 84, '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js', 'js', 83, 
'123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js', 'js', '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3' ) ] class TestSignature(unittest.TestCase): def setUp(self): TEST_DIR = os.path.dirname(os.path.abspath(__file__)) self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata') if not os.path.exists(self.TESTDATA_DIR): os.mkdir(self.TESTDATA_DIR) def make_tfunc(url, stype, sig_input, expected_sig): m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) assert m, '%r should follow URL format' % url test_id = m.group(1) def test_func(self): basename = 'player-%s.%s' % (test_id, stype) fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): compat_urlretrieve(url, fn) ydl = FakeYDL() ie = YoutubeIE(ydl) if stype == 'js': with io.open(fn, encoding='utf-8') as testf: jscode = testf.read() func = ie._parse_sig_js(jscode) else: assert stype == 'swf' with open(fn, 'rb') as testf: swfcode = testf.read() func = ie._parse_sig_swf(swfcode) src_sig = ( compat_str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) got_sig = func(src_sig) self.assertEqual(got_sig, expected_sig) test_func.__name__ = str('test_signature_' + stype + '_' + test_id) setattr(TestSignature, test_func.__name__, test_func) for test_spec in _TESTS: make_tfunc(*test_spec) if __name__ == '__main__': unittest.main()
youtube-dl/test/swftests/0000755000000000000000000000000012641030331014477 5ustar rootroot
youtube-dl/test/swftests/PrivateCall.as0000644000000000000000000000045412641030331017235 0ustar rootroot// input: [] // output: 9 package { public class PrivateCall { public static function main():int{ var f:OtherClass = new OtherClass(); return f.func(); } } } class OtherClass { private function pf():int { return 9; } public function func():int { return this.pf(); } }
youtube-dl/test/swftests/NeOperator.as0000644000000000000000000000062512641030331017105 0ustar rootroot// input: [] // output: 123 package { public class NeOperator { public static function main(): int { var res:int = 0; if (1 != 2) { res += 3; } else { res += 4; } if (2 != 2) { res += 10; } else { res += 20; } if (9 == 9) { res += 100; } return res; } } }
youtube-dl/test/swftests/PrivateVoidCall.swf0000644000000000000000000000113012660177730020263 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/StringCharCodeAt.swf0000644000000000000000000000105412660177742020367 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/StaticRetrieval.swf0000644000000000000000000000100412660177735020345 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/ConstantInt.swf0000644000000000000000000000100412660177707017503 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/ConstArrayAccess.as0000644000000000000000000000053012641030331020231 0ustar rootroot// input: [] // output: 4 package { public class ConstArrayAccess { private static const x:int = 2; private static const ar:Array = ["42", "3411"]; public static function main():int{ var c:ConstArrayAccess = new ConstArrayAccess(); return c.f(); } public function f(): int { return ar[1].length; } } }
youtube-dl/test/swftests/.gitignore0000644000000000000000000000000612641030331016463 0ustar rootroot*.swf
youtube-dl/test/swftests/PrivateCall.swf0000644000000000000000000000112312660177725017447 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/MemberAssignment.swf0000644000000000000000000000105612660177721020502 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/StaticAssignment.as0000644000000000000000000000027012641030331020303 0ustar rootroot// input: [1] // output: 1 package { public class StaticAssignment { public static var v:int; public static function main(a:int):int{ v = a; return v; } } }
youtube-dl/test/swftests/DictCall.swf0000644000000000000000000000104112660177711016712 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/StringFunctions.swf0000644000000000000000000000100712432263447020373 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/EqualsOperator.as0000644000000000000000000000022312641030331017771 0ustar rootroot// input: [] // output: false package { public class EqualsOperator { public static function main():Boolean{ return 1 == 2; } } }
youtube-dl/test/swftests/PrivateVoidCall.as0000644000000000000000000000051112641030331020051 0ustar rootroot// input: [] // output: 9 package { public class PrivateVoidCall { public static function main():int{ var f:OtherClass = new OtherClass(); f.func(); return 9; } } } class OtherClass { private function pf():void { ; } public function func():void { this.pf(); } }
youtube-dl/test/swftests/ClassConstruction.as0000644000000000000000000000026512641030331020507 0ustar rootroot// input: [] // output: 0 package { public class ClassConstruction { public static function main():int{ var f:Foo = new Foo(); return 0; } } } class Foo { }
youtube-dl/test/swftests/LocalVars.as0000644000000000000000000000034212641030331016711 0ustar rootroot// input: [1, 2] // output: 3 package { public class LocalVars { public static function main(a:int, b:int):int{ var c:int = a + b + b; var d:int = c - b; var e:int = d; return e; } } }
youtube-dl/test/swftests/ClassConstruction.swf0000644000000000000000000000105112660177702020714 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/StringCharCodeAt.as0000644000000000000000000000031612641030331020150 0ustar rootroot// input: [] // output: 9897 package { public class StringCharCodeAt { public static function main():int{ var s:String = "abc"; return s.charCodeAt(1) * 100 + s.charCodeAt(); } } }
youtube-dl/test/swftests/StringConversion.as0000644000000000000000000000026212641030331020340 0ustar rootroot// input: [] // output: 2 package { public class StringConversion { public static function main():int{ var s:String = String(99); return s.length; } } }
youtube-dl/test/swftests/ClassCall.swf0000644000000000000000000000110512660177677017104 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/EqualsOperator.swf0000644000000000000000000000077712660177714020213 0ustar rootroot[binary SWF data omitted]
youtube-dl/test/swftests/DictCall.as0000644000000000000000000000024012641030331016502 0ustar rootroot// input: [{"x": 1, "y": 2}] // output: 3 package { public class DictCall { public static function main(d:Object):int{ return d.x + d.y; } } }
youtube-dl/test/swftests/StaticRetrieval.as0000644000000000000000000000034212641030331020130 0ustar rootroot// input: [] // output: 1 package { public class StaticRetrieval { public static var v:int; public static function main():int{ if (v) { return 0; } else { return 1; } } } }
youtube-dl/test/test_jsinterp.py0000644000000000000000000000702112641030331016064 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.jsinterp import JSInterpreter class TestJSInterpreter(unittest.TestCase): def test_basic(self): jsi = JSInterpreter('function x(){;}') self.assertEqual(jsi.call_function('x'), None) jsi = JSInterpreter('function x3(){return 42;}') self.assertEqual(jsi.call_function('x3'), 42) jsi = JSInterpreter('var x5 = function(){return 42;}') self.assertEqual(jsi.call_function('x5'), 42) def test_calc(self): jsi = JSInterpreter('function x4(a){return 2*a+1;}') self.assertEqual(jsi.call_function('x4', 3), 7) def test_empty_return(self): jsi = JSInterpreter('function f(){return; y()}') self.assertEqual(jsi.call_function('f'), None) def test_morespace(self): jsi = JSInterpreter('function x (a) { return 2 * a + 1 ; }') self.assertEqual(jsi.call_function('x', 3), 7) jsi = JSInterpreter('function f () { x = 2 ; return x; }') self.assertEqual(jsi.call_function('f'), 2) def test_strange_chars(self): jsi = JSInterpreter('function $_xY1 ($_axY1) { var $_axY2 = $_axY1 + 1; return $_axY2; }') self.assertEqual(jsi.call_function('$_xY1', 20), 21) def test_operators(self): jsi = JSInterpreter('function f(){return 1 << 5;}') self.assertEqual(jsi.call_function('f'), 32) jsi = JSInterpreter('function f(){return 19 & 21;}') self.assertEqual(jsi.call_function('f'), 17) jsi = JSInterpreter('function f(){return 11 >> 2;}') self.assertEqual(jsi.call_function('f'), 2) def test_array_access(self): jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2] = 7; return x;}') self.assertEqual(jsi.call_function('f'), [5, 2, 7]) def test_parens(self): jsi = JSInterpreter('function f(){return (1) + (2) * ((( (( (((((3)))))) )) ));}') self.assertEqual(jsi.call_function('f'), 7) jsi = JSInterpreter('function f(){return (1 + 2) * 3;}') self.assertEqual(jsi.call_function('f'), 9) def test_assignments(self): jsi = JSInterpreter('function f(){var x = 20; x = 30 + 1; return x;}') self.assertEqual(jsi.call_function('f'), 31) jsi = JSInterpreter('function f(){var x = 20; x += 30 + 1; return x;}') self.assertEqual(jsi.call_function('f'), 51) jsi = JSInterpreter('function f(){var x = 20; x -= 30 + 1; return x;}') self.assertEqual(jsi.call_function('f'), -11) def test_comments(self): 'Skipping: Not yet fully implemented' return jsi = JSInterpreter(''' function x() { var x = /* 1 + */ 2; var y = /* 30 * 40 */ 50; return x + y; } ''') self.assertEqual(jsi.call_function('x'), 52) jsi = JSInterpreter(''' function f() { var x = "/*"; var y = 1 /* comment */ + 2; return y; } ''') self.assertEqual(jsi.call_function('f'), 3) def test_precedence(self): jsi = 
JSInterpreter(''' function x() { var a = [10, 20, 30, 40, 50]; var b = 6; a[0]=a[b%a.length]; return a; }''') self.assertEqual(jsi.call_function('x'), [20, 20, 30, 40, 50]) if __name__ == '__main__': unittest.main() youtube-dl/test/test_execution.py0000644000000000000000000000245012641030331016232 0ustar rootroot#!/usr/bin/env python # coding: utf-8 from __future__ import unicode_literals import unittest import sys import os import subprocess sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.utils import encodeArgument rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) try: _DEV_NULL = subprocess.DEVNULL except AttributeError: _DEV_NULL = open(os.devnull, 'wb') class TestExecution(unittest.TestCase): def test_import(self): subprocess.check_call([sys.executable, '-c', 'import youtube_dl'], cwd=rootDir) def test_module_exec(self): if sys.version_info >= (2, 7): # Python 2.6 doesn't support package execution subprocess.check_call([sys.executable, '-m', 'youtube_dl', '--version'], cwd=rootDir, stdout=_DEV_NULL) def test_main_exec(self): subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) def test_cmdline_umlauts(self): p = subprocess.Popen( [sys.executable, 'youtube_dl/__main__.py', encodeArgument('ä'), '--version'], cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE) _, stderr = p.communicate() self.assertFalse(stderr) if __name__ == '__main__': unittest.main() youtube-dl/test/test_iqiyi_sdk_interpreter.py0000644000000000000000000000211712662564617020665 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.extractor import IqiyiIE class IqiyiIEWithCredentials(IqiyiIE): def _get_login_info(self): return 'foo', 'bar' class WarningLogger(object): def __init__(self): self.messages = [] def warning(self, msg): self.messages.append(msg) def debug(self, msg): pass def error(self, msg): pass class TestIqiyiSDKInterpreter(unittest.TestCase): def test_iqiyi_sdk_interpreter(self): ''' Test the functionality of IqiyiSDKInterpreter by trying to log in If `sign` is incorrect, /validate call throws an HTTP 556 error ''' logger = WarningLogger() ie = IqiyiIEWithCredentials(FakeYDL({'logger': logger})) ie._login() self.assertTrue('unable to log in:' in logger.messages[0]) if __name__ == '__main__': unittest.main() youtube-dl/test/test_InfoExtractor.py0000644000000000000000000000536312641030331017024 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor class TestIE(InfoExtractor): pass class TestInfoExtractor(unittest.TestCase): def setUp(self): self.ie = TestIE(FakeYDL()) def test_ie_key(self): self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) def test_html_search_regex(self): html = '
<p id="foo">Watch this <a href="http://www.youtube.com/watch?v=BaW_jenozKc">video</a></p>' search = lambda re, *args: self.ie._html_search_regex(re, html, *args) self.assertEqual(search(r'<p id="foo">(.+?)</p>', 'foo'), 'Watch this video') def test_opengraph(self): ie = self.ie html = ''' <meta name="og:title" property=og:title content="Foo"></meta> <meta name="og:description" property="og:description" content="Some video's description "/> <meta property=og:image content='http://domain.com/pic.jpg?key1=val1&amp;key2=val2'/> <meta content='application/x-shockwave-flash' property='og:video:type'> <meta content='Foo' property=og:foobar> <meta name="og:test1" content='foo > < bar'/> <meta name="og:test2" content="foo >//< bar"/> ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') self.assertEqual(ie._og_search_video_url(html, default=None), None) self.assertEqual(ie._og_search_property('foobar', html), 'Foo') self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') def test_html_search_meta(self): ie = self.ie html = ''' <meta name="a" content="1" /> <meta name='b' content='2'/> <meta name="c" content='3'/> <meta name=d content='4'/> <meta property="e" content='5' /> <meta content="6" name="f"> ''' self.assertEqual(ie._html_search_meta('a', html), '1') self.assertEqual(ie._html_search_meta('b', html), '2') self.assertEqual(ie._html_search_meta('c', html), '3') self.assertEqual(ie._html_search_meta('d', html), '4') self.assertEqual(ie._html_search_meta('e', html), '5') self.assertEqual(ie._html_search_meta('f', html), '6') if __name__ == '__main__': unittest.main()
youtube-dl/test/test_postprocessors.py0000644000000000000000000000075512641030331017345 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.postprocessor import MetadataFromTitlePP class TestMetadataFromTitle(unittest.TestCase): def test_format_to_regex(self): pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') self.assertEqual(pp._titleregex, '(?P<title>.+)\ \-\ (?P<artist>.+)')
youtube-dl/test/test_swfinterp.py0000644000000000000000000000426012641030331016251 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import errno import io import json import re import subprocess from youtube_dl.swfinterp import SWFInterpreter TEST_DIR = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'swftests') class TestSWFInterpreter(unittest.TestCase): pass def _make_testfunc(testfile): m = re.match(r'^(.*)\.(as)$', testfile) if not m: return test_id = m.group(1) def test_func(self): as_file = os.path.join(TEST_DIR, testfile) swf_file = os.path.join(TEST_DIR, test_id + '.swf') if ((not os.path.exists(swf_file)) or os.path.getmtime(swf_file) < os.path.getmtime(as_file)): # Recompile try: subprocess.check_call([ 'mxmlc', '-output', swf_file, '-static-link-runtime-shared-libraries', as_file]) except OSError as ose: if ose.errno == errno.ENOENT: print('mxmlc not found! 
Skipping test.') return raise with open(swf_file, 'rb') as swf_f: swf_content = swf_f.read() swfi = SWFInterpreter(swf_content) with io.open(as_file, 'r', encoding='utf-8') as as_f: as_content = as_f.read() def _find_spec(key): m = re.search( r'(?m)^//\s*%s:\s*(.*?)\n' % re.escape(key), as_content) if not m: raise ValueError('Cannot find %s in %s' % (key, testfile)) return json.loads(m.group(1)) input_args = _find_spec('input') output = _find_spec('output') swf_class = swfi.extract_class(test_id) func = swfi.extract_function(swf_class, 'main') res = func(input_args) self.assertEqual(res, output) test_func.__name__ = str('test_swf_' + test_id) setattr(TestSWFInterpreter, test_func.__name__, test_func) for testfile in os.listdir(TEST_DIR): _make_testfunc(testfile) if __name__ == '__main__': unittest.main() ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/test/test_http.py������������������������������������������������������������������������0000644�0000000�0000000�00000010112�12641030331�015200� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): def log_message(self, format, *args): pass def do_GET(self): if self.path == '/video.html': self.send_response(200) self.send_header('Content-Type', 'text/html; charset=utf-8') self.end_headers() self.wfile.write(b'<html><video src="/vid.mp4" /></html>') elif self.path == '/vid.mp4': self.send_response(200) self.send_header('Content-Type', 'video/mp4') self.end_headers() self.wfile.write(b'\x00\x00\x00\x00\x20\x66\x74[video]') else: assert False class FakeLogger(object): def debug(self, msg): pass def warning(self, msg): pass def error(self, msg): pass class TestHTTP(unittest.TestCase): def setUp(self): certfn = os.path.join(TEST_DIR, 'testcert.pem') self.httpd = compat_http_server.HTTPServer( ('localhost', 0), HTTPTestRequestHandler) self.httpd.socket = ssl.wrap_socket( self.httpd.socket, certfile=certfn, server_side=True) self.port = self.httpd.socket.getsockname()[1] self.server_thread = threading.Thread(target=self.httpd.serve_forever) self.server_thread.daemon = True self.server_thread.start() def test_nocheckcertificate(self): if sys.version_info >= (2, 7, 9): # No certificate checking anyways ydl = YoutubeDL({'logger': FakeLogger()}) self.assertRaises( Exception, ydl.extract_info, 'https://localhost:%d/video.html' % self.port) ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True}) r = ydl.extract_info('https://localhost:%d/video.html' % self.port) self.assertEqual(r['url'], 
'https://localhost:%d/vid.mp4' % self.port) def _build_proxy_handler(name): class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): proxy_name = name def log_message(self, format, *args): pass def do_GET(self): self.send_response(200) self.send_header('Content-Type', 'text/plain; charset=utf-8') self.end_headers() self.wfile.write('{self.proxy_name}: {self.path}'.format(self=self).encode('utf-8')) return HTTPTestRequestHandler class TestProxy(unittest.TestCase): def setUp(self): self.proxy = compat_http_server.HTTPServer( ('localhost', 0), _build_proxy_handler('normal')) self.port = self.proxy.socket.getsockname()[1] self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) self.proxy_thread.daemon = True self.proxy_thread.start() self.cn_proxy = compat_http_server.HTTPServer( ('localhost', 0), _build_proxy_handler('cn')) self.cn_port = self.cn_proxy.socket.getsockname()[1] self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever) self.cn_proxy_thread.daemon = True self.cn_proxy_thread.start() def test_proxy(self): cn_proxy = 'localhost:{0}'.format(self.cn_port) ydl = YoutubeDL({ 'proxy': 'localhost:{0}'.format(self.port), 'cn_verification_proxy': cn_proxy, }) url = 'http://foo.com/bar' response = ydl.urlopen(url).read().decode('utf-8') self.assertEqual(response, 'normal: {0}'.format(url)) req = compat_urllib_request.Request(url) req.add_header('Ytdl-request-proxy', cn_proxy) response = ydl.urlopen(req).read().decode('utf-8') self.assertEqual(response, 'cn: {0}'.format(url)) if __name__ == '__main__': unittest.main() ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/test/__init__.py�������������������������������������������������������������������������0000644�0000000�0000000�00000000000�12641030331�014714� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/test/test_write_annotations.py�����������������������������������������������������������0000644�0000000�0000000�00000004763�12645665720�020033� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python # coding: utf-8 from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import get_params, try_rm import io import xml.etree.ElementTree import youtube_dl.YoutubeDL import youtube_dl.extractor class YoutubeDL(youtube_dl.YoutubeDL): def __init__(self, *args, **kwargs): super(YoutubeDL, self).__init__(*args, **kwargs) self.to_stderr = 
self.to_screen params = get_params({ 'writeannotations': True, 'skip_download': True, 'writeinfojson': False, 'format': 'flv', }) TEST_ID = 'gr51aVj-mLg' ANNOTATIONS_FILE = TEST_ID + '.annotations.xml' EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label'] class TestAnnotations(unittest.TestCase): def setUp(self): # Clear old files self.tearDown() def test_info_json(self): expected = list(EXPECTED_ANNOTATIONS) # Two annotations could have the same text. ie = youtube_dl.extractor.YoutubeIE() ydl = YoutubeDL(params) ydl.add_info_extractor(ie) ydl.download([TEST_ID]) self.assertTrue(os.path.exists(ANNOTATIONS_FILE)) annoxml = None with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof: annoxml = xml.etree.ElementTree.parse(annof) self.assertTrue(annoxml is not None, 'Failed to parse annotations XML') root = annoxml.getroot() self.assertEqual(root.tag, 'document') annotationsTag = root.find('annotations') self.assertEqual(annotationsTag.tag, 'annotations') annotations = annotationsTag.findall('annotation') # Not all the annotations have TEXT children and the annotations are returned unsorted. for a in annotations: self.assertEqual(a.tag, 'annotation') if a.get('type') == 'text': textTag = a.find('TEXT') text = textTag.text self.assertTrue(text in expected) # assertIn only added in python 2.7 # remove the first occurrence, there could be more than one annotation with the same text expected.remove(text) # We should have seen (and removed) all the expected annotation texts. self.assertEqual(len(expected), 0, 'Not all expected annotations were found.') def tearDown(self): try_rm(ANNOTATIONS_FILE) if __name__ == '__main__': unittest.main() �������������youtube-dl/test/test_subtitles.py�������������������������������������������������������������������0000644�0000000�0000000�00000032635�12654643170�016274� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python from __future__ import unicode_literals # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL, md5 from youtube_dl.extractor import ( YoutubeIE, DailymotionIE, TEDIE, VimeoIE, WallaIE, CeskaTelevizeIE, LyndaIE, NPOIE, ComedyCentralIE, NRKTVIE, RaiTVIE, VikiIE, ThePlatformIE, ThePlatformFeedIE, RTVEALaCartaIE, FunnyOrDieIE, DemocracynowIE, ) class BaseTestSubtitles(unittest.TestCase): url = None IE = None def setUp(self): self.DL = FakeYDL() self.ie = self.IE() self.DL.add_info_extractor(self.ie) def getInfoDict(self): info_dict = self.DL.extract_info(self.url, download=False) return info_dict def getSubtitles(self): info_dict = self.getInfoDict() subtitles = info_dict['requested_subtitles'] if not subtitles: return subtitles for sub_info in subtitles.values(): if sub_info.get('data') is None: uf = self.DL.urlopen(sub_info['url']) sub_info['data'] = uf.read().decode('utf-8') return dict((l, sub_info['data']) for l, sub_info in subtitles.items()) class TestYoutubeSubtitles(BaseTestSubtitles): url = 'QRS8MkLhQmM' IE = YoutubeIE def test_youtube_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() 
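# With 'writesubtitles' and 'allsubtitles' both set, every available subtitle
# track should be extracted; the language count and per-language md5 checks
# below pin down the expected extraction output.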
self.assertEqual(len(subtitles.keys()), 13) self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5') for lang in ['fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) def test_youtube_subtitles_ttml_format(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'ttml' subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54') def test_youtube_subtitles_vtt_format(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'vtt' subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' self.DL.params['writeautomaticsub'] = True self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertTrue(subtitles['it'] is not None) def test_youtube_translated_subtitles(self): # This video has a subtitles track, which can be translated self.url = 'Ky9eprVWzlI' self.DL.params['writeautomaticsub'] = True self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertTrue(subtitles['it'] is not None) def test_youtube_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') self.url = 'n5BB19UTcdA' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertFalse(subtitles) class TestDailymotionSubtitles(BaseTestSubtitles): url = 'http://www.dailymotion.com/video/xczg00' IE = DailymotionIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 6) self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') for lang in ['es', 'fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertFalse(subtitles) class TestTedSubtitles(BaseTestSubtitles): url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' IE = TEDIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 28) self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') for lang in ['es', 'fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) class TestVimeoSubtitles(BaseTestSubtitles): url = 'http://vimeo.com/76979871' IE = VimeoIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') def test_nosubtitles(self): self.DL.expect_warning('video 
doesn\'t have subtitles') self.url = 'http://vimeo.com/56015672' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertFalse(subtitles) class TestWallaSubtitles(BaseTestSubtitles): url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' IE = WallaIE def test_allsubtitles(self): self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['heb'])) self.assertEqual(md5(subtitles['heb']), 'e758c5d7cb982f6bef14f377ec7a3920') def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') self.url = 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertFalse(subtitles) class TestCeskaTelevizeSubtitles(BaseTestSubtitles): url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' IE = CeskaTelevizeIE def test_allsubtitles(self): self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['cs'])) self.assertTrue(len(subtitles['cs']) > 20000) def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') self.url = 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertFalse(subtitles) class TestLyndaSubtitles(BaseTestSubtitles): url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' IE = LyndaIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') class TestNPOSubtitles(BaseTestSubtitles): url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' IE = NPOIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['nl'])) self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') class TestMTVSubtitles(BaseTestSubtitles): url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' IE = ComedyCentralIE def getInfoDict(self): return super(TestMTVSubtitles, self).getInfoDict()['entries'][0] def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') class TestNRKSubtitles(BaseTestSubtitles): url = 'http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1' IE = NRKTVIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['no'])) self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') class TestRaiSubtitles(BaseTestSubtitles): url = 
'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html' IE = RaiTVIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['it'])) self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' IE = VikiIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) self.assertEqual(md5(subtitles['en']), '53cb083a5914b2d84ef1ab67b880d18a') class TestThePlatformSubtitles(BaseTestSubtitles): # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/ # (see http://theplatform.com/about/partners/type/subtitles-closed-captioning/) url = 'theplatform:JFUjUE1_ehvq' IE = ThePlatformIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') class TestThePlatformFeedSubtitles(BaseTestSubtitles): url = 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207' IE = ThePlatformFeedIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) self.assertEqual(md5(subtitles['en']), '48649a22e82b2da21c9a67a395eedade') class TestRtveSubtitles(BaseTestSubtitles): url = 'http://www.rtve.es/alacarta/videos/los-misterios-de-laura/misterios-laura-capitulo-32-misterio-del-numero-17-2-parte/2428621/' IE = RTVEALaCartaIE def test_allsubtitles(self): print('Skipping, only available from Spain') return self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['es'])) self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') class TestFunnyOrDieSubtitles(BaseTestSubtitles): url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine' IE = FunnyOrDieIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') class TestDemocracynowSubtitles(BaseTestSubtitles): url = 'http://www.democracynow.org/shows/2015/7/3' IE = DemocracynowIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') def test_subtitles_in_page(self): self.url = 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') if __name__ == '__main__': unittest.main() 
youtube-dl/test/versions.json

{
    "latest": "2013.01.06",
    "signature": "72158cdba391628569ffdbea259afbcf279bbe3d8aeb7492690735dc1cfa6afa754f55c61196f3871d429599ab22f2667f1fec98865527b32632e7f4b3675a7ef0f0fbe084d359256ae4bba68f0d33854e531a70754712f244be71d4b92e664302aa99653ee4df19800d955b6c4149cd2b3f24288d6e4b40b16126e01f4c8ce6",
    "versions": {
        "2013.01.02": {
            "bin": [
                "http://youtube-dl.org/downloads/2013.01.02/youtube-dl",
                "f5b502f8aaa77675c4884938b1e4871ebca2611813a0c0e74f60c0fbd6dcca6b"
            ],
            "exe": [
                "http://youtube-dl.org/downloads/2013.01.02/youtube-dl.exe",
                "75fa89d2ce297d102ff27675aa9d92545bbc91013f52ec52868c069f4f9f0422"
            ],
            "tar": [
                "http://youtube-dl.org/downloads/2013.01.02/youtube-dl-2013.01.02.tar.gz",
                "6a66d022ac8e1c13da284036288a133ec8dba003b7bd3a5179d0c0daca8c8196"
            ]
        },
        "2013.01.06": {
            "bin": [
                "http://youtube-dl.org/downloads/2013.01.06/youtube-dl",
                "64b6ed8865735c6302e836d4d832577321b4519aa02640dc508580c1ee824049"
            ],
            "exe": [
                "http://youtube-dl.org/downloads/2013.01.06/youtube-dl.exe",
                "58609baf91e4389d36e3ba586e21dab882daaaee537e4448b1265392ae86ff84"
            ],
            "tar": [
                "http://youtube-dl.org/downloads/2013.01.06/youtube-dl-2013.01.06.tar.gz",
                "fe77ab20a95d980ed17a659aa67e371fdd4d656d19c4c7950e7b720b0c2f1a86"
            ]
        }
    }
}

youtube-dl/test/parameters.json

{
    "consoletitle": false,
    "continuedl": true,
    "forcedescription": false,
    "forcefilename": false,
    "forceformat": false,
    "forcethumbnail": false,
    "forcetitle": false,
    "forceurl": false,
    "format": "best",
    "ignoreerrors": false,
    "listformats": null,
    "logtostderr": false,
    "matchtitle": null,
    "max_downloads": null,
    "nooverwrites": false,
    "nopart": false,
    "noprogress": false,
    "outtmpl": "%(id)s.%(ext)s",
    "password": null,
    "playlistend": -1,
    "playliststart": 1,
    "prefer_free_formats": false,
    "quiet": false,
    "ratelimit": null,
    "rejecttitle": null,
    "retries": 10,
    "simulate": false,
    "subtitleslang": null,
    "subtitlesformat": "best",
    "test": true,
    "updatetime": true,
    "usenetrc": false,
    "username": null,
    "verbose": true,
    "writedescription": false,
    "writeinfojson": true,
    "writesubtitles": false,
    "allsubtitles": false,
    "listsubtitles": false,
    "socket_timeout": 20,
    "fixup": "never"
}
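# The defaults above are consumed through get_params() in test/helper.py
# (directly below): each test loads this file and then overlays its own
# per-test overrides. A minimal sketch of that flow, assuming it is run from
# the repository root; the override values are illustrative only:

import io
import json

def load_test_params(path='test/parameters.json', override=None):
    # mirrors test.helper.get_params(): read the defaults, apply overrides
    with io.open(path, encoding='utf-8') as pf:
        params = json.load(pf)
    if override:
        params.update(override)
    return params

params = load_test_params(override={'writesubtitles': True, 'allsubtitles': True})
assert params['writesubtitles'] is True  # the per-test override wins
assert params['format'] == 'best'        # untouched default from the file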
false, "allsubtitles": false, "listssubtitles": false, "socket_timeout": 20, "fixup": "never" } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/test/helper.py���������������������������������������������������������������������������0000644�0000000�0000000�00000021357�12641030331�014456� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import errno import io import hashlib import json import os.path import re import types import sys import youtube_dl.extractor from youtube_dl import YoutubeDL from youtube_dl.utils import ( compat_str, preferredencoding, write_string, ) def get_params(override=None): PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: parameters = json.load(pf) if override: parameters.update(override) return parameters def try_rm(filename): """ Remove a file if it exists """ try: os.remove(filename) except OSError as ose: if ose.errno != errno.ENOENT: raise def report_warning(message): ''' Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored ''' if sys.stderr.isatty() and os.name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 'WARNING:' output = '%s %s\n' % (_msg_header, message) if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3: output = output.encode(preferredencoding()) sys.stderr.write(output) class FakeYDL(YoutubeDL): def __init__(self, override=None): # Different instances of the downloader can't share the same dictionary # some test set the "sublang" parameter, which would break the md5 checks. 
params = get_params(override=override) super(FakeYDL, self).__init__(params, auto_init=False) self.result = [] def to_screen(self, s, skip_eol=None): print(s) def trouble(self, s, tb=None): raise Exception(s) def download(self, x): self.result.append(x) def expect_warning(self, regex): # Silence an expected warning matching a regex old_report_warning = self.report_warning def report_warning(self, message): if re.match(regex, message): return old_report_warning(message) self.report_warning = types.MethodType(report_warning, self) def gettestcases(include_onlymatching=False): for ie in youtube_dl.extractor.gen_extractors(): for tc in ie.get_testcases(include_onlymatching): yield tc md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() def expect_value(self, got, expected, field): if isinstance(expected, compat_str) and expected.startswith('re:'): match_str = expected[len('re:'):] match_rex = re.compile(match_str) self.assertTrue( isinstance(got, compat_str), 'Expected a %s object, but got %s for field %s' % ( compat_str.__name__, type(got).__name__, field)) self.assertTrue( match_rex.match(got), 'field %s (value: %r) should match %r' % (field, got, match_str)) elif isinstance(expected, compat_str) and expected.startswith('startswith:'): start_str = expected[len('startswith:'):] self.assertTrue( isinstance(got, compat_str), 'Expected a %s object, but got %s for field %s' % ( compat_str.__name__, type(got).__name__, field)) self.assertTrue( got.startswith(start_str), 'field %s (value: %r) should start with %r' % (field, got, start_str)) elif isinstance(expected, compat_str) and expected.startswith('contains:'): contains_str = expected[len('contains:'):] self.assertTrue( isinstance(got, compat_str), 'Expected a %s object, but got %s for field %s' % ( compat_str.__name__, type(got).__name__, field)) self.assertTrue( contains_str in got, 'field %s (value: %r) should contain %r' % (field, got, contains_str)) elif isinstance(expected, type): self.assertTrue( isinstance(got, expected), 'Expected type %r for field %s, but got value %r of type %r' % (expected, field, got, type(got))) elif isinstance(expected, dict) and isinstance(got, dict): expect_dict(self, got, expected) elif isinstance(expected, list) and isinstance(got, list): self.assertEqual( len(expected), len(got), 'Expect a list of length %d, but got a list of length %d for field %s' % ( len(expected), len(got), field)) for index, (item_got, item_expected) in enumerate(zip(got, expected)): type_got = type(item_got) type_expected = type(item_expected) self.assertEqual( type_expected, type_got, 'Type mismatch for list item at index %d for field %s, expected %r, got %r' % ( index, field, type_expected, type_got)) expect_value(self, item_got, item_expected, field) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): got = 'md5:' + md5(got) elif isinstance(expected, compat_str) and expected.startswith('mincount:'): self.assertTrue( isinstance(got, (list, dict)), 'Expected field %s to be a list or a dict, but it is of type %s' % ( field, type(got).__name__)) expected_num = int(expected.partition(':')[2]) assertGreaterEqual( self, len(got), expected_num, 'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got))) return self.assertEqual( expected, got, 'Invalid value for field %s, expected %r, got %r' % (field, expected, got)) def expect_dict(self, got_dict, expected_dict): for info_field, expected in expected_dict.items(): got = got_dict.get(info_field) expect_value(self, got, expected, 
info_field)


def expect_info_dict(self, got_dict, expected_dict):
    expect_dict(self, got_dict, expected_dict)
    # Check for the presence of mandatory fields
    if got_dict.get('_type') not in ('playlist', 'multi_video'):
        for key in ('id', 'url', 'title', 'ext'):
            self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
    # Check for mandatory fields that are automatically set by YoutubeDL
    for key in ['webpage_url', 'extractor', 'extractor_key']:
        self.assertTrue(got_dict.get(key), 'Missing field: %s' % key)

    # Are checkable fields missing from the test case definition?
    test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
                          for key, value in got_dict.items()
                          if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location', 'age_limit'))
    missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
    if missing_keys:
        def _repr(v):
            if isinstance(v, compat_str):
                return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'").replace('\n', '\\n')
            else:
                return repr(v)
        info_dict_str = ''
        if len(missing_keys) != len(expected_dict):
            info_dict_str += ''.join(
                '    %s: %s,\n' % (_repr(k), _repr(v))
                for k, v in test_info_dict.items() if k not in missing_keys)

            if info_dict_str:
                info_dict_str += '\n'
        info_dict_str += ''.join(
            '    %s: %s,\n' % (_repr(k), _repr(test_info_dict[k]))
            for k in missing_keys)
        write_string(
            '\n\'info_dict\': {\n' + info_dict_str + '},\n', out=sys.stderr)
        self.assertFalse(
            missing_keys,
            'Missing keys in test definition: %s' % (
                ', '.join(sorted(missing_keys))))


def assertRegexpMatches(self, text, regexp, msg=None):
    if hasattr(self, 'assertRegexp'):
        return self.assertRegexp(text, regexp, msg)
    else:
        m = re.match(regexp, text)
        if not m:
            note = 'Regexp didn\'t match: %r not found' % (regexp)
            if len(text) < 1000:
                note += ' in %r' % text
            if msg is None:
                msg = note
            else:
                msg = note + ', ' + msg
            self.assertTrue(m, msg)


def assertGreaterEqual(self, got, expected, msg=None):
    if not (got >= expected):
        if msg is None:
            msg = '%r not greater than or equal to %r' % (got, expected)
        self.assertTrue(got >= expected, msg)


def expect_warnings(ydl, warnings_re):
    real_warning = ydl.report_warning

    def _report_warning(w):
        if not any(re.search(w_re, w) for w_re in warnings_re):
            real_warning(w)

    ydl.report_warning = _report_warning

youtube-dl/test/test_utils.py

#!/usr/bin/env python
# coding: utf-8

from __future__ import unicode_literals

# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Various small unit tests
import io
import json
import xml.etree.ElementTree

from youtube_dl.utils import (
    age_restricted,
    args_to_str,
    clean_html,
    DateRange,
    detect_exe_version,
determine_ext, dict_get, encode_compat_str, encodeFilename, escape_rfc3986, escape_url, ExtractorError, find_xpath_attr, fix_xml_ampersands, InAdvancePagedList, intlist_to_bytes, is_html, js_to_json, limit_length, ohdave_rsa_encrypt, OnDemandPagedList, orderedSet, parse_duration, parse_filesize, parse_iso8601, read_batch_urls, sanitize_filename, sanitize_path, prepend_extension, replace_extension, remove_quotes, shell_quote, smuggle_url, str_to_int, strip_jsonp, struct_unpack, timeconvert, unescapeHTML, unified_strdate, unsmuggle_url, uppercase_escape, lowercase_escape, url_basename, urlencode_postdata, version_tuple, xpath_with_ns, xpath_element, xpath_text, xpath_attr, render_table, match_str, parse_dfxp_time_expr, dfxp2srt, cli_option, cli_valueless_option, cli_bool_option, ) from youtube_dl.compat import ( compat_etree_fromstring, ) class TestUtil(unittest.TestCase): def test_timeconvert(self): self.assertTrue(timeconvert('') is None) self.assertTrue(timeconvert('bougrg') is None) def test_sanitize_filename(self): self.assertEqual(sanitize_filename('abc'), 'abc') self.assertEqual(sanitize_filename('abc_d-e'), 'abc_d-e') self.assertEqual(sanitize_filename('123'), '123') self.assertEqual('abc_de', sanitize_filename('abc/de')) self.assertFalse('/' in sanitize_filename('abc/de///')) self.assertEqual('abc_de', sanitize_filename('abc/<>\\*|de')) self.assertEqual('xxx', sanitize_filename('xxx/<>\\*|')) self.assertEqual('yes no', sanitize_filename('yes? no')) self.assertEqual('this - that', sanitize_filename('this: that')) self.assertEqual(sanitize_filename('AT&T'), 'AT&T') aumlaut = 'ä' self.assertEqual(sanitize_filename(aumlaut), aumlaut) tests = '\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0446\u0430' self.assertEqual(sanitize_filename(tests), tests) self.assertEqual( sanitize_filename('New World record at 0:12:34'), 'New World record at 0_12_34') self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf') self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf') self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf') self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf') forbidden = '"\0\\/' for fc in forbidden: for fbc in forbidden: self.assertTrue(fbc not in sanitize_filename(fc)) def test_sanitize_filename_restricted(self): self.assertEqual(sanitize_filename('abc', restricted=True), 'abc') self.assertEqual(sanitize_filename('abc_d-e', restricted=True), 'abc_d-e') self.assertEqual(sanitize_filename('123', restricted=True), '123') self.assertEqual('abc_de', sanitize_filename('abc/de', restricted=True)) self.assertFalse('/' in sanitize_filename('abc/de///', restricted=True)) self.assertEqual('abc_de', sanitize_filename('abc/<>\\*|de', restricted=True)) self.assertEqual('xxx', sanitize_filename('xxx/<>\\*|', restricted=True)) self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True)) self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True)) tests = 'a\xe4b\u4e2d\u56fd\u7684c' self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c') self.assertTrue(sanitize_filename('\xf6', restricted=True) != '') # No empty filename forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#' for fc in forbidden: for fbc in forbidden: self.assertTrue(fbc not in sanitize_filename(fc, restricted=True)) # Handle a common case more neatly self.assertEqual(sanitize_filename('\u5927\u58f0\u5e26 - Song', restricted=True), 'Song') self.assertEqual(sanitize_filename('\u603b\u7edf: Speech', restricted=True), 'Speech') # .. 
but make sure the file name is never empty self.assertTrue(sanitize_filename('-', restricted=True) != '') self.assertTrue(sanitize_filename(':', restricted=True) != '') def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') def test_sanitize_path(self): if sys.platform != 'win32': return self.assertEqual(sanitize_path('abc'), 'abc') self.assertEqual(sanitize_path('abc/def'), 'abc\\def') self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') self.assertEqual(sanitize_path('abc|def'), 'abc#def') self.assertEqual(sanitize_path('<>:"|?*'), '#######') self.assertEqual(sanitize_path('C:/abc/def'), 'C:\\abc\\def') self.assertEqual(sanitize_path('C?:/abc/def'), 'C##\\abc\\def') self.assertEqual(sanitize_path('\\\\?\\UNC\\ComputerName\\abc'), '\\\\?\\UNC\\ComputerName\\abc') self.assertEqual(sanitize_path('\\\\?\\UNC/ComputerName/abc'), '\\\\?\\UNC\\ComputerName\\abc') self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') self.assertEqual(sanitize_path('\\\\?\\C:/abc'), '\\\\?\\C:\\abc') self.assertEqual(sanitize_path('\\\\?\\C:\\ab?c\\de:f'), '\\\\?\\C:\\ab#c\\de#f') self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') self.assertEqual( sanitize_path('youtube/%(uploader)s/%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s'), 'youtube\\%(uploader)s\\%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s') self.assertEqual( sanitize_path('youtube/TheWreckingYard ./00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part'), 'youtube\\TheWreckingYard #\\00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part') self.assertEqual(sanitize_path('abc/def...'), 'abc\\def..#') self.assertEqual(sanitize_path('abc.../def'), 'abc..#\\def') self.assertEqual(sanitize_path('abc.../def...'), 'abc..#\\def..#') self.assertEqual(sanitize_path('../abc'), '..\\abc') self.assertEqual(sanitize_path('../../abc'), '..\\..\\abc') self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') def test_prepend_extension(self): self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext') self.assertEqual(prepend_extension('abc.ext', 'temp', 'ext'), 'abc.temp.ext') self.assertEqual(prepend_extension('abc.unexpected_ext', 'temp', 'ext'), 'abc.unexpected_ext.temp') self.assertEqual(prepend_extension('abc', 'temp'), 'abc.temp') self.assertEqual(prepend_extension('.abc', 'temp'), '.abc.temp') self.assertEqual(prepend_extension('.abc.ext', 'temp'), '.abc.temp.ext') def test_replace_extension(self): self.assertEqual(replace_extension('abc.ext', 'temp'), 'abc.temp') self.assertEqual(replace_extension('abc.ext', 'temp', 'ext'), 'abc.temp') self.assertEqual(replace_extension('abc.unexpected_ext', 'temp', 'ext'), 'abc.unexpected_ext.temp') self.assertEqual(replace_extension('abc', 'temp'), 'abc.temp') self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp') self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp') def test_remove_quotes(self): self.assertEqual(remove_quotes(None), None) self.assertEqual(remove_quotes('"'), '"') self.assertEqual(remove_quotes("'"), "'") self.assertEqual(remove_quotes(';'), ';') self.assertEqual(remove_quotes('";'), '";') self.assertEqual(remove_quotes('""'), '') self.assertEqual(remove_quotes('";"'), ';') def test_ordered_set(self): self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 
6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])
        self.assertEqual(orderedSet([]), [])
        self.assertEqual(orderedSet([1]), [1])
        # keep the list ordered
        self.assertEqual(orderedSet([135, 1, 1, 1]), [135, 1])

    def test_unescape_html(self):
        self.assertEqual(unescapeHTML('%20;'), '%20;')
        self.assertEqual(unescapeHTML('&#x2F;'), '/')
        self.assertEqual(unescapeHTML('&#47;'), '/')
        self.assertEqual(unescapeHTML('&eacute;'), 'é')
        self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')

    def test_daterange(self):
        _20century = DateRange("19000101", "20000101")
        self.assertFalse("17890714" in _20century)
        _ac = DateRange("00010101")
        self.assertTrue("19690721" in _ac)
        _firstmilenium = DateRange(end="10000101")
        self.assertTrue("07110427" in _firstmilenium)

    def test_unified_dates(self):
        self.assertEqual(unified_strdate('December 21, 2010'), '20101221')
        self.assertEqual(unified_strdate('8/7/2009'), '20090708')
        self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
        self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
        self.assertEqual(unified_strdate('1968 12 10'), '19681210')
        self.assertEqual(unified_strdate('1968-12-10'), '19681210')
        self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
        self.assertEqual(
            unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),
            '20141126')
        self.assertEqual(
            unified_strdate('2/2/2015 6:47:40 PM', day_first=False),
            '20150202')
        self.assertEqual(unified_strdate('25-09-2014'), '20140925')
        self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None)

    def test_determine_ext(self):
        self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
        self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None)
        self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None)
        self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None)
        self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8')

    def test_find_xpath_attr(self):
        testxml = '''<root>
            <node/>
            <node x="a"/>
            <node x="a" y="c" />
            <node x="b" y="d" />
            <node x="" />
        </root>'''
        doc = compat_etree_fromstring(testxml)

        self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)
        self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
        self.assertEqual(find_xpath_attr(doc, './/node', 'n'), None)
        self.assertEqual(find_xpath_attr(doc, './/node', 'n', 'v'), None)
        self.assertEqual(find_xpath_attr(doc, './/node', 'x'), doc[1])
        self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
        self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'b'), doc[3])
        self.assertEqual(find_xpath_attr(doc, './/node', 'y'), doc[2])
        self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
        self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'd'), doc[3])
        self.assertEqual(find_xpath_attr(doc, './/node', 'x', ''), doc[4])

    def test_xpath_with_ns(self):
        testxml = '''<root xmlns:media="http://example.com/">
            <media:song>
                <media:author>The Author</media:author>
                <url>http://server.com/download.mp3</url>
            </media:song>
        </root>'''
        doc = compat_etree_fromstring(testxml)
        find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))
        self.assertTrue(find('media:song') is not None)
        self.assertEqual(find('media:song/media:author').text, 'The Author')
        self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3')

    def test_xpath_element(self):
        doc = xml.etree.ElementTree.Element('root')
        div = xml.etree.ElementTree.SubElement(doc, 'div')
        p = xml.etree.ElementTree.SubElement(div, 'p')
p.text = 'Foo' self.assertEqual(xpath_element(doc, 'div/p'), p) self.assertEqual(xpath_element(doc, ['div/p']), p) self.assertEqual(xpath_element(doc, ['div/bar', 'div/p']), p) self.assertEqual(xpath_element(doc, 'div/bar', default='default'), 'default') self.assertEqual(xpath_element(doc, ['div/bar'], default='default'), 'default') self.assertTrue(xpath_element(doc, 'div/bar') is None) self.assertTrue(xpath_element(doc, ['div/bar']) is None) self.assertTrue(xpath_element(doc, ['div/bar'], 'div/baz') is None) self.assertRaises(ExtractorError, xpath_element, doc, 'div/bar', fatal=True) self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar'], fatal=True) self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar', 'div/baz'], fatal=True) def test_xpath_text(self): testxml = '''<root> <div> <p>Foo</p> </div> </root>''' doc = compat_etree_fromstring(testxml) self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default') self.assertTrue(xpath_text(doc, 'div/bar') is None) self.assertRaises(ExtractorError, xpath_text, doc, 'div/bar', fatal=True) def test_xpath_attr(self): testxml = '''<root> <div> <p x="a">Foo</p> </div> </root>''' doc = compat_etree_fromstring(testxml) self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) self.assertEqual(xpath_attr(doc, 'div/bar', 'x', default='default'), 'default') self.assertEqual(xpath_attr(doc, 'div/p', 'y', default='default'), 'default') self.assertRaises(ExtractorError, xpath_attr, doc, 'div/bar', 'x', fatal=True) self.assertRaises(ExtractorError, xpath_attr, doc, 'div/p', 'y', fatal=True) def test_smuggle_url(self): data = {"ö": "ö", "abc": [3]} url = 'https://foo.bar/baz?x=y#a' smug_url = smuggle_url(url, data) unsmug_url, unsmug_data = unsmuggle_url(smug_url) self.assertEqual(url, unsmug_url) self.assertEqual(data, unsmug_data) res_url, res_data = unsmuggle_url(url) self.assertEqual(res_url, url) self.assertEqual(res_data, None) def test_shell_quote(self): args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')] self.assertEqual(shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""") def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) self.assertEqual(str_to_int('123.456'), 123456) def test_url_basename(self): self.assertEqual(url_basename('http://foo.de/'), '') self.assertEqual(url_basename('http://foo.de/bar/baz'), 'baz') self.assertEqual(url_basename('http://foo.de/bar/baz?x=y'), 'baz') self.assertEqual(url_basename('http://foo.de/bar/baz#x=y'), 'baz') self.assertEqual(url_basename('http://foo.de/bar/baz/'), 'baz') self.assertEqual( url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'), 'trailer.mp4') def test_parse_duration(self): self.assertEqual(parse_duration(None), None) self.assertEqual(parse_duration(False), None) self.assertEqual(parse_duration('invalid'), None) self.assertEqual(parse_duration('1'), 1) self.assertEqual(parse_duration('1337:12'), 80232) self.assertEqual(parse_duration('9:12:43'), 33163) self.assertEqual(parse_duration('12:00'), 720) self.assertEqual(parse_duration('00:01:01'), 61) self.assertEqual(parse_duration('x:y'), None) self.assertEqual(parse_duration('3h11m53s'), 11513) self.assertEqual(parse_duration('3h 11m 53s'), 11513) self.assertEqual(parse_duration('3 hours 11 minutes 53 seconds'), 11513) self.assertEqual(parse_duration('3 hours 11 mins 53 secs'), 11513) self.assertEqual(parse_duration('62m45s'), 3765) 
        self.assertEqual(parse_duration('6m59s'), 419)
        self.assertEqual(parse_duration('49s'), 49)
        self.assertEqual(parse_duration('0h0m0s'), 0)
        self.assertEqual(parse_duration('0m0s'), 0)
        self.assertEqual(parse_duration('0s'), 0)
        self.assertEqual(parse_duration('01:02:03.05'), 3723.05)
        self.assertEqual(parse_duration('T30M38S'), 1838)
        self.assertEqual(parse_duration('5 s'), 5)
        self.assertEqual(parse_duration('3 min'), 180)
        self.assertEqual(parse_duration('2.5 hours'), 9000)
        self.assertEqual(parse_duration('02:03:04'), 7384)
        self.assertEqual(parse_duration('01:02:03:04'), 93784)
        self.assertEqual(parse_duration('1 hour 3 minutes'), 3780)
        self.assertEqual(parse_duration('87 Min.'), 5220)

    def test_fix_xml_ampersands(self):
        self.assertEqual(
            fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
        self.assertEqual(
            fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
            '"&amp;x=y&amp;wrong;&amp;z=a')
        self.assertEqual(
            fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
            '&amp;&apos;&gt;&lt;&quot;')
        self.assertEqual(
            fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
        self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')

    def test_paged_list(self):
        def testPL(size, pagesize, sliceargs, expected):
            def get_page(pagenum):
                firstid = pagenum * pagesize
                upto = min(size, pagenum * pagesize + pagesize)
                for i in range(firstid, upto):
                    yield i

            pl = OnDemandPagedList(get_page, pagesize)
            got = pl.getslice(*sliceargs)
            self.assertEqual(got, expected)

            iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize)
            got = iapl.getslice(*sliceargs)
            self.assertEqual(got, expected)

        testPL(5, 2, (), [0, 1, 2, 3, 4])
        testPL(5, 2, (1,), [1, 2, 3, 4])
        testPL(5, 2, (2,), [2, 3, 4])
        testPL(5, 2, (4,), [4])
        testPL(5, 2, (0, 3), [0, 1, 2])
        testPL(5, 2, (1, 4), [1, 2, 3])
        testPL(5, 2, (2, 99), [2, 3, 4])
        testPL(5, 2, (20, 99), [])

    def test_struct_unpack(self):
        self.assertEqual(struct_unpack('!B', b'\x00'), (0,))

    def test_read_batch_urls(self):
        f = io.StringIO('''\xef\xbb\xbf foo
            bar\r
            baz
            # More after this line\r
            ; or after this
            bam''')
        self.assertEqual(read_batch_urls(f), ['foo', 'bar', 'baz', 'bam'])

    def test_urlencode_postdata(self):
        data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})
        self.assertTrue(isinstance(data, bytes))

    def test_dict_get(self):
        FALSE_VALUES = {
            'none': None,
            'false': False,
            'zero': 0,
            'empty_string': '',
            'empty_list': [],
        }
        d = FALSE_VALUES.copy()
        d['a'] = 42
        self.assertEqual(dict_get(d, 'a'), 42)
        self.assertEqual(dict_get(d, 'b'), None)
        self.assertEqual(dict_get(d, 'b', 42), 42)
        self.assertEqual(dict_get(d, ('a', )), 42)
        self.assertEqual(dict_get(d, ('b', 'a', )), 42)
        self.assertEqual(dict_get(d, ('b', 'c', 'a', 'd', )), 42)
        self.assertEqual(dict_get(d, ('b', 'c', )), None)
        self.assertEqual(dict_get(d, ('b', 'c', ), 42), 42)
        for key, false_value in FALSE_VALUES.items():
            self.assertEqual(dict_get(d, ('b', 'c', key, )), None)
            self.assertEqual(dict_get(d, ('b', 'c', key, ), skip_false_values=False), false_value)

    def test_encode_compat_str(self):
        self.assertEqual(encode_compat_str(b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', 'utf-8'), 'тест')
        self.assertEqual(encode_compat_str('тест', 'utf-8'), 'тест')

    def test_parse_iso8601(self):
        self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266)
        self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
        self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
        self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266)
        self.assertEqual(parse_iso8601('2015-09-29T08:27:31.727'), 1443515251)
        self.assertEqual(parse_iso8601('2015-09-29T08-27-31.727'), None)

    def test_strip_jsonp(self):
        stripped =
strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);') d = json.loads(stripped) self.assertEqual(d, [{"id": "532cb", "x": 3}]) stripped = strip_jsonp('parseMetadata({"STATUS":"OK"})\n\n\n//epc') d = json.loads(stripped) self.assertEqual(d, {'STATUS': 'OK'}) stripped = strip_jsonp('ps.embedHandler({"status": "success"});') d = json.loads(stripped) self.assertEqual(d, {'status': 'success'}) def test_uppercase_escape(self): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') def test_lowercase_escape(self): self.assertEqual(lowercase_escape('aä'), 'aä') self.assertEqual(lowercase_escape('\\u0026'), '&') def test_limit_length(self): self.assertEqual(limit_length(None, 12), None) self.assertEqual(limit_length('foo', 12), 'foo') self.assertTrue( limit_length('foo bar baz asd', 12).startswith('foo bar')) self.assertTrue('...' in limit_length('foo bar baz asd', 12)) def test_escape_rfc3986(self): reserved = "!*'();:@&=+$,/?#[]" unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~' self.assertEqual(escape_rfc3986(reserved), reserved) self.assertEqual(escape_rfc3986(unreserved), unreserved) self.assertEqual(escape_rfc3986('тест'), '%D1%82%D0%B5%D1%81%D1%82') self.assertEqual(escape_rfc3986('%D1%82%D0%B5%D1%81%D1%82'), '%D1%82%D0%B5%D1%81%D1%82') self.assertEqual(escape_rfc3986('foo bar'), 'foo%20bar') self.assertEqual(escape_rfc3986('foo%20bar'), 'foo%20bar') def test_escape_url(self): self.assertEqual( escape_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'), 'http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre%CC%81_FD.mp4' ) self.assertEqual( escape_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'), 'http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl%C3%A4rt/Das-Erste/Video?documentId=22673108&bcastId=5290' ) self.assertEqual( escape_url('http://тест.рф/фрагмент'), 'http://тест.рф/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' ) self.assertEqual( escape_url('http://тест.рф/абв?абв=абв#абв'), 'http://тест.рф/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' ) self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') def test_js_to_json_realworld(self): inp = '''{ 'clip':{'provider':'pseudo'} }''' self.assertEqual(js_to_json(inp), '''{ "clip":{"provider":"pseudo"} }''') json.loads(js_to_json(inp)) inp = '''{ 'playlist':[{'controls':{'all':null}}] }''' self.assertEqual(js_to_json(inp), '''{ "playlist":[{"controls":{"all":null}}] }''') inp = '''"The CW\\'s \\'Crazy Ex-Girlfriend\\'"''' self.assertEqual(js_to_json(inp), '''"The CW's 'Crazy Ex-Girlfriend'"''') inp = '"SAND Number: SAND 2013-7800P\\nPresenter: Tom Russo\\nHabanero Software Training - Xyce Software\\nXyce, Sandia\\u0027s"' json_code = js_to_json(inp) self.assertEqual(json.loads(json_code), json.loads(inp)) def test_js_to_json_edgecases(self): on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}") self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"}) on = js_to_json('{"abc": true}') self.assertEqual(json.loads(on), {'abc': True}) # Ignore JavaScript code as well on = js_to_json('''{ "x": 1, y: "a", z: some.code }''') d = json.loads(on) self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) on = js_to_json('{"abc": "def",}') 
        self.assertEqual(json.loads(on), {'abc': 'def'})

    def test_clean_html(self):
        self.assertEqual(clean_html('a:\nb'), 'a: b')
        self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"')

    def test_intlist_to_bytes(self):
        self.assertEqual(
            intlist_to_bytes([0, 1, 127, 128, 255]),
            b'\x00\x01\x7f\x80\xff')

    def test_args_to_str(self):
        self.assertEqual(
            args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
            'foo ba/r -baz \'2 be\' \'\''
        )

    def test_parse_filesize(self):
        self.assertEqual(parse_filesize(None), None)
        self.assertEqual(parse_filesize(''), None)
        self.assertEqual(parse_filesize('91 B'), 91)
        self.assertEqual(parse_filesize('foobar'), None)
        self.assertEqual(parse_filesize('2 MiB'), 2097152)
        self.assertEqual(parse_filesize('5 GB'), 5000000000)
        self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
        self.assertEqual(parse_filesize('1,24 KB'), 1240)

    def test_version_tuple(self):
        self.assertEqual(version_tuple('1'), (1,))
        self.assertEqual(version_tuple('10.23.344'), (10, 23, 344))
        self.assertEqual(version_tuple('10.1-6'), (10, 1, 6))  # avconv style

    def test_detect_exe_version(self):
        self.assertEqual(detect_exe_version('''ffmpeg version 1.2.1
built on May 27 2013 08:37:26 with gcc 4.7 (Debian 4.7.3-4)
configuration: --prefix=/usr --extra-'''), '1.2.1')
        self.assertEqual(detect_exe_version('''ffmpeg version N-63176-g1fb4685
built on May 15 2014 22:09:06 with gcc 4.8.2 (GCC)'''), 'N-63176-g1fb4685')
        self.assertEqual(detect_exe_version('''X server found. dri2 connection failed!
Trying to open render node...
Success at /dev/dri/renderD128.
ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')

    def test_age_restricted(self):
        self.assertFalse(age_restricted(None, 10))  # unrestricted content
        self.assertFalse(age_restricted(1, None))  # unrestricted policy
        self.assertFalse(age_restricted(8, 10))
        self.assertTrue(age_restricted(18, 14))
        self.assertFalse(age_restricted(18, 18))

    def test_is_html(self):
        self.assertFalse(is_html(b'\x49\x44\x43<html'))
        self.assertTrue(is_html(b'<!DOCTYPE foo>\xaaa'))
        self.assertTrue(is_html(  # UTF-8 with BOM
            b'\xef\xbb\xbf<!DOCTYPE foo>\xaaa'))
        self.assertTrue(is_html(  # UTF-16-LE
            b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00'
        ))
        self.assertTrue(is_html(  # UTF-16-BE
            b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4'
        ))
        self.assertTrue(is_html(  # UTF-32-BE
            b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4'))
        self.assertTrue(is_html(  # UTF-32-LE
            b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00'))

    def test_render_table(self):
        self.assertEqual(
            render_table(
                ['a', 'bcd'],
                [[123, 4], [9999, 51]]),
            'a    bcd\n'
            '123  4\n'
            '9999 51')
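    # test_render_table above pins down the alignment contract: each column is
    # left-justified to its widest cell and columns are joined with a single
    # space. A minimal sketch that reproduces exactly that output, shown here
    # as a commented aside (illustrative only; the real implementation lives
    # in youtube_dl/utils.py and may differ in details):
    #
    #     def render_table_sketch(header_row, data):
    #         rows = [header_row] + [[str(cell) for cell in row] for row in data]
    #         widths = [max(len(row[i]) for row in rows)
    #                   for i in range(len(header_row))]
    #         return '\n'.join(
    #             ' '.join(cell.ljust(w) for cell, w in zip(row, widths)).rstrip()
    #             for row in rows)
    #
    #     assert render_table_sketch(['a', 'bcd'], [[123, 4], [9999, 51]]) == (
    #         'a    bcd\n123  4\n9999 51')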
    def test_match_str(self):
        self.assertRaises(ValueError, match_str, 'xy>foobar', {})
        self.assertFalse(match_str('xy', {'x': 1200}))
        self.assertTrue(match_str('!xy', {'x': 1200}))
        self.assertTrue(match_str('x', {'x': 1200}))
        self.assertFalse(match_str('!x', {'x': 1200}))
        self.assertTrue(match_str('x', {'x': 0}))
        self.assertFalse(match_str('x>0', {'x': 0}))
        self.assertFalse(match_str('x>0', {}))
        self.assertTrue(match_str('x>?0', {}))
        self.assertTrue(match_str('x>1K', {'x': 1200}))
        self.assertFalse(match_str('x>2K', {'x': 1200}))
        self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200}))
        self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200}))
        self.assertFalse(match_str('y=a212', {'y': 'foobar42'}))
        self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'}))
        self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'}))
        self.assertTrue(match_str('y!=foobar2', {'y': 'foobar42'}))
        self.assertFalse(match_str(
            'like_count > 100 & dislike_count <? 50 & description',
            {'like_count': 90, 'description': 'foo'}))
        self.assertTrue(match_str(
            'like_count > 100 & dislike_count <? 50 & description',
            {'like_count': 190, 'description': 'foo'}))
        self.assertFalse(match_str(
            'like_count > 100 & dislike_count <? 50 & description',
            {'like_count': 190, 'dislike_count': 60, 'description': 'foo'}))
        self.assertFalse(match_str(
            'like_count > 100 & dislike_count <? 50 & description',
            {'like_count': 190, 'dislike_count': 10}))

    def test_parse_dfxp_time_expr(self):
        self.assertEqual(parse_dfxp_time_expr(None), None)
        self.assertEqual(parse_dfxp_time_expr(''), None)
        self.assertEqual(parse_dfxp_time_expr('0.1'), 0.1)
        self.assertEqual(parse_dfxp_time_expr('0.1s'), 0.1)
        self.assertEqual(parse_dfxp_time_expr('00:00:01'), 1.0)
        self.assertEqual(parse_dfxp_time_expr('00:00:01.100'), 1.1)
        self.assertEqual(parse_dfxp_time_expr('00:00:01:100'), 1.1)

    def test_dfxp2srt(self):
        dfxp_data = '''<?xml version="1.0" encoding="UTF-8"?>
            <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
            <body>
                <div xml:lang="en">
                    <p begin="0" end="1">The following line contains Chinese characters and special symbols</p>
                    <p begin="1" end="2">第二行<br/>♪♪</p>
                    <p begin="2" dur="1"><span>Third<br/>Line</span></p>
                    <p begin="3" end="-1">Lines with invalid timestamps are ignored</p>
                    <p begin="-1" end="-1">Ignore, two</p>
                    <p begin="3" dur="-1">Ignored, three</p>
                </div>
            </body>
            </tt>'''
        srt_data = '''1
00:00:00,000 --> 00:00:01,000
The following line contains Chinese characters and special symbols

2
00:00:01,000 --> 00:00:02,000
第二行
♪♪

3
00:00:02,000 --> 00:00:03,000
Third
Line

'''
        self.assertEqual(dfxp2srt(dfxp_data), srt_data)

        dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?>
            <tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
            <body>
                <div xml:lang="en">
                    <p begin="0" end="1">The first line</p>
                </div>
            </body>
            </tt>'''
        srt_data = '''1
00:00:00,000 --> 00:00:01,000
The first line

'''
        self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
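    # The expected SRT blocks above use 'HH:MM:SS,mmm' timestamps. A small
    # self-contained helper showing that formatting rule, kept as a commented
    # aside (illustrative only, not the actual dfxp2srt internals):
    #
    #     def srt_timestamp(seconds):
    #         msec = int(round(seconds * 1000))
    #         hours, msec = divmod(msec, 3600000)
    #         mins, msec = divmod(msec, 60000)
    #         secs, msec = divmod(msec, 1000)
    #         return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msec)
    #
    #     assert srt_timestamp(1.1) == '00:00:01,100'
    #     assert srt_timestamp(61) == '00:01:01,000'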
    def test_cli_option(self):
        self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'),
                         ['--proxy', '127.0.0.1:3128'])
        self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])
        self.assertEqual(cli_option({}, '--proxy', 'proxy'), [])

    def test_cli_valueless_option(self):
        self.assertEqual(cli_valueless_option(
            {'downloader': 'external'}, '--external-downloader', 'downloader', 'external'), ['--external-downloader'])
        self.assertEqual(cli_valueless_option(
            {'downloader': 'internal'}, '--external-downloader', 'downloader', 'external'), [])
        self.assertEqual(cli_valueless_option(
            {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate'), ['--no-check-certificate'])
        self.assertEqual(cli_valueless_option(
            {'nocheckcertificate': False}, '--no-check-certificate', 'nocheckcertificate'), [])
        self.assertEqual(cli_valueless_option(
            {'checkcertificate': True}, '--no-check-certificate', 'checkcertificate', False), [])
        self.assertEqual(cli_valueless_option(
            {'checkcertificate': False}, '--no-check-certificate', 'checkcertificate', False), ['--no-check-certificate'])

    def test_cli_bool_option(self):
        self.assertEqual(
            cli_bool_option(
                {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate'),
            ['--no-check-certificate', 'true'])
        self.assertEqual(
            cli_bool_option(
                {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate', separator='='),
            ['--no-check-certificate=true'])
        self.assertEqual(
            cli_bool_option(
                {'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true'),
            ['--check-certificate', 'false'])
        self.assertEqual(
            cli_bool_option(
                {'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='),
            ['--check-certificate=false'])
        self.assertEqual(
            cli_bool_option(
                {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true'),
            ['--check-certificate', 'true'])
        self.assertEqual(
            cli_bool_option(
                {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='),
            ['--check-certificate=true'])

    def test_ohdave_rsa_encrypt(self):
        N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
        e = 65537

        self.assertEqual(
            ohdave_rsa_encrypt(b'aa111222', e, N),
            '726664bd9a23fd0c70f9f1b84aab5e3905ce1e45a584e9cbcf9bcc7510338fc1986d6c599ff990d923aa43c51c0d9013cd572e13bc58f4ae48f2ed8c0b0ba881')


if __name__ == '__main__':
    unittest.main()

youtube-dl/test/test_all_urls.py

#!/usr/bin/env python
from __future__ import unicode_literals

# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from test.helper import gettestcases
from youtube_dl.extractor import (
    FacebookIE,
    gen_extractors,
    YoutubeIE,
)


class TestAllURLsMatching(unittest.TestCase):
    def setUp(self):
        self.ies = gen_extractors()

    def matching_ies(self, url):
        return [ie.IE_NAME for ie in self.ies if ie.suitable(url) and ie.IE_NAME != 'generic']

    def assertMatch(self, url, ie_list):
        self.assertEqual(self.matching_ies(url), ie_list)

    def test_youtube_playlist_matching(self):
        assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
        assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
        assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q')  # 585
        assertPlaylist('PL63F0C78739B09958')
        assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
        assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
        assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
        assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')  # 668
        self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
        # Top tracks
        assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')

    def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) # 668 self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube']) self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube']) def test_youtube_channel_matching(self): assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') def test_youtube_user_matching(self): self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user']) def test_youtube_feeds(self): self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) def test_youtube_show_matching(self): self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) def test_youtube_search_matching(self): self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') assertExtractId('BaW_jenozKc', 'BaW_jenozKc') def test_facebook_matching(self): self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) self.assertTrue(FacebookIE.suitable('https://www.facebook.com/cindyweather?fref=ts#!/photo.php?v=10152183998945793')) def test_no_duplicates(self): ies = gen_extractors() for tc in gettestcases(include_onlymatching=True): url = tc['url'] for ie in ies: if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'): self.assertTrue(ie.suitable(url), '%s should match URL %r' % (type(ie).__name__, url)) else: self.assertFalse( ie.suitable(url), '%s should not match URL %r . That URL belongs to %s.' 
% (type(ie).__name__, url, tc['name']))

    def test_keywords(self):
        self.assertMatch(':ytsubs', ['youtube:subscriptions'])
        self.assertMatch(':ytsubscriptions', ['youtube:subscriptions'])
        self.assertMatch(':ythistory', ['youtube:history'])
        self.assertMatch(':thedailyshow', ['ComedyCentralShows'])
        self.assertMatch(':tds', ['ComedyCentralShows'])

    def test_vimeo_matching(self):
        self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel'])
        self.assertMatch('https://vimeo.com/channels/31259', ['vimeo:channel'])
        self.assertMatch('https://vimeo.com/channels/31259/53576664', ['vimeo'])
        self.assertMatch('https://vimeo.com/user7108434', ['vimeo:user'])
        self.assertMatch('https://vimeo.com/user7108434/videos', ['vimeo:user'])
        self.assertMatch('https://vimeo.com/user21297594/review/75524534/3c257a1b5d', ['vimeo:review'])

    # https://github.com/rg3/youtube-dl/issues/1930
    def test_soundcloud_not_matching_sets(self):
        self.assertMatch('http://soundcloud.com/floex/sets/gone-ep', ['soundcloud:set'])

    def test_tumblr(self):
        self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', ['Tumblr'])
        self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430', ['Tumblr'])

    def test_pbs(self):
        # https://github.com/rg3/youtube-dl/issues/2350
        self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['pbs'])
        self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['pbs'])

    def test_yahoo_https(self):
        # https://github.com/rg3/youtube-dl/issues/2701
        self.assertMatch(
            'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html',
            ['Yahoo'])


if __name__ == '__main__':
    unittest.main()

youtube-dl/test/test_netrc.py

# coding: utf-8
from __future__ import unicode_literals

import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from youtube_dl.extractor import (
    gen_extractors,
)


class TestNetRc(unittest.TestCase):
    def test_netrc_present(self):
        for ie in gen_extractors():
            if not hasattr(ie, '_login'):
                continue
            self.assertTrue(
                hasattr(ie, '_NETRC_MACHINE'),
                'Extractor %s supports login, but is missing a _NETRC_MACHINE property' % ie.IE_NAME)


if __name__ == '__main__':
    unittest.main()
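# test_netrc_present above enforces a convention: any extractor that can log
# in must declare a _NETRC_MACHINE key, so credentials can come from the
# user's ~/.netrc instead of the command line. A sketch of the stdlib side of
# that lookup ('youtube' is an illustrative machine key; the actual option
# plumbing lives elsewhere in youtube-dl):

import netrc

def netrc_credentials(machine):
    # returns (login, account, password) for the machine entry, or None
    try:
        return netrc.netrc().authenticators(machine)
    except (IOError, netrc.NetrcParseError):
        return None

creds = netrc_credentials('youtube')
if creds is not None:
    username, _, password = creds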
youtube-dl/test/test_aes.py

#!/usr/bin/env python
from __future__ import unicode_literals

# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_decrypt_text
from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes
import base64

# the encrypted data can be generated with 'devscripts/generate_aes_testdata.py'


class TestAES(unittest.TestCase):
    def setUp(self):
        self.key = self.iv = [0x20, 0x15] + 14 * [0]
        self.secret_msg = b'Secret message goes here'

    def test_encrypt(self):
        msg = b'message'
        key = list(range(16))
        encrypted = aes_encrypt(bytes_to_intlist(msg), key)
        decrypted = intlist_to_bytes(aes_decrypt(encrypted, key))
        self.assertEqual(decrypted, msg)

    def test_cbc_decrypt(self):
        data = bytes_to_intlist(
            b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd"
        )
        decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv))
        self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg)

    def test_decrypt_text(self):
        password = intlist_to_bytes(self.key).decode('utf-8')
        encrypted = base64.b64encode(
            intlist_to_bytes(self.iv[:8]) +
            b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae'
        ).decode('utf-8')
        decrypted = (aes_decrypt_text(encrypted, password, 16))
        self.assertEqual(decrypted, self.secret_msg)

        password = intlist_to_bytes(self.key).decode('utf-8')
        encrypted = base64.b64encode(
            intlist_to_bytes(self.iv[:8]) +
            b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83'
        ).decode('utf-8')
        decrypted = (aes_decrypt_text(encrypted, password, 32))
        self.assertEqual(decrypted, self.secret_msg)


if __name__ == '__main__':
    unittest.main()
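# The AES helpers above operate on lists of byte-sized ints rather than bytes
# objects, hence the bytes_to_intlist/intlist_to_bytes shims. A round-trip
# sketch mirroring test_encrypt (same key and message shapes as the test;
# only the variable names are new):

from youtube_dl.aes import aes_decrypt, aes_encrypt
from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes

key = list(range(16))  # a 128-bit key expressed as 16 ints
plaintext = b'message'
ciphertext = aes_encrypt(bytes_to_intlist(plaintext), key)
assert intlist_to_bytes(aes_decrypt(ciphertext, key)) == plaintext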
youtube-dl/test/video-vid.mp4
(binary MP4 test fixture, 14 bytes; raw contents omitted)

youtube-dl/test/test_update.py

#!/usr/bin/env python
from __future__ import unicode_literals

# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import json

from youtube_dl.update import rsa_verify


class TestUpdate(unittest.TestCase):
    def test_rsa_verify(self):
        UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'versions.json'), 'rb') as f:
            versions_info = f.read().decode()
        versions_info = json.loads(versions_info)
        signature = versions_info['signature']
        del versions_info['signature']
        self.assertTrue(rsa_verify(
            json.dumps(versions_info, sort_keys=True).encode('utf-8'),
            signature, UPDATES_RSA_KEY))


if __name__ == '__main__':
    unittest.main()
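# What rsa_verify() is handed above is not the raw versions.json file: the
# signature covers a canonical re-serialization with the 'signature' key
# removed. A sketch of how that signed payload is built (mirrors the test
# body; the function name is hypothetical):

import json

def signed_payload(versions_info):
    # drop the signature itself, then dump with sorted keys so the byte
    # representation is deterministic
    info = dict(versions_info)
    info.pop('signature', None)
    return json.dumps(info, sort_keys=True).encode('utf-8')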
youtube-dl/test/test_age_restriction.py
#!/usr/bin/env python
from __future__ import unicode_literals

# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from test.helper import try_rm
from youtube_dl import YoutubeDL


def _download_restricted(url, filename, age):
    """ Returns true if the file has been downloaded """

    params = {
        'age_limit': age,
        'skip_download': True,
        'writeinfojson': True,
        'outtmpl': '%(id)s.%(ext)s',
    }
    ydl = YoutubeDL(params)
    ydl.add_default_info_extractors()
    json_filename = os.path.splitext(filename)[0] + '.info.json'
    try_rm(json_filename)
    ydl.download([url])
    res = os.path.exists(json_filename)
    try_rm(json_filename)
    return res


class TestAgeRestriction(unittest.TestCase):
    def _assert_restricted(self, url, filename, age, old_age=None):
        self.assertTrue(_download_restricted(url, filename, old_age))
        self.assertFalse(_download_restricted(url, filename, age))

    def test_youtube(self):
        self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10)

    def test_youporn(self):
        self._assert_restricted(
            'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
            '505835.mp4', 2, old_age=25)


if __name__ == '__main__':
    unittest.main()

youtube-dl/test/testcert.pem
-----BEGIN PRIVATE KEY-----
MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDMF0bAzaHAdIyB
HRmnIp4vv40lGqEePmWqicCl0QZ0wsb5dNysSxSa7330M2QeQopGfdaUYF1uTcNp
Qx6ECgBSfg+RrOBI7r/u4F+sKX8MUXVaf/5QoBUrGNGSn/pp7HMGOuQqO6BVg4+h
A1ySSwUG8mZItLRry1ISyErmW8b9xlqfd97uLME/5tX+sMelRFjUbAx8A4CK58Ev
mMguHVTlXzx5RMdYcf1VScYcjlV/qA45uzP8zwI5aigfcmUD+tbGuQRhKxUhmw0J
aobtOR6+JSOAULW5gYa/egE4dWLwbyM6b6eFbdnjlQzEA1EW7ChMPAW/Mo83KyiP
tKMCSQulAgMBAAECggEALCfBDAexPjU5DNoh6bIorUXxIJzxTNzNHCdvgbCGiA54
BBKPh8s6qwazpnjT6WQWDIg/O5zZufqjE4wM9x4+0Zoqfib742ucJO9wY4way6x4
Clt0xzbLPabB+MoZ4H7ip+9n2+dImhe7pGdYyOHoNYeOL57BBi1YFW42Hj6u/8pd
63YCXisto3Rz1YvRQVjwsrS+cRKZlzAFQRviL30jav7Wh1aWEfcXxjj4zhm8pJdk
ITGtq6howz57M0NtX6hZnfe8ywzTnDFIGKIMA2cYHuYJcBh9bc4tCGubTvTKK9UE
8fM+f6UbfGqfpKCq1mcgs0XMoFDSzKS9+mSJn0+5JQKBgQD+OCKaeH3Yzw5zGnlw
XuQfMJGNcgNr+ImjmvzUAC2fAZUJLAcQueE5kzMv5Fmd+EFE2CEX1Vit3tg0SXvA
G+bq609doILHMA03JHnV1npO/YNIhG3AAtJlKYGxQNfWH9mflYj9mEui8ZFxG52o
zWhHYuifOjjZszUR+/eio6NPzwKBgQDNhUBTrT8LIX4SE/EFUiTlYmWIvOMgXYvN
8Cm3IRNQ/yyphZaXEU0eJzfX5uCDfSVOgd6YM/2pRah+t+1Hvey4H8e0GVTu5wMP
gkkqwKPGIR1YOmlw6ippqwvoJD7LuYrm6Q4D6e1PvkjwCq6lEndrOPmPrrXNd0JJ
XO60y3U2SwKBgQDLkyZarryQXxcCI6Q10Tc6pskYDMIit095PUbTeiUOXNT9GE28
Hi32ziLCakk9kCysNasii81MxtQ54tJ/f5iGbNMMddnkKl2a19Hc5LjjAm4cJzg/
98KGEhvyVqvAo5bBDZ06/rcrD+lZOzUglQS5jcIcqCIYa0LHWQ/wJLxFzwKBgFcZ
1SRhdSmDfUmuF+S4ZpistflYjC3IV5rk4NkS9HvMWaJS0nqdw4A3AMzItXgkjq4S
DkOVLTkTI5Do5HAWRv/VwC5M2hkR4NMu1VGAKSisGiKtRsirBWSZMEenLNHshbjN
Jrpz5rZ4H7NT46ZkCCZyFBpX4gb9NyOedjA7Via3AoGARF8RxbYjnEGGFuhnbrJB
FTPR0vaL4faY3lOgRZ8jOG9V2c9Hzi/y8a8TU4C11jnJSDqYCXBTd5XN28npYxtD
pjRsCwy6ze+yvYXPO7C978eMG3YRyj366NXUxnXN59ibwe/lxi2OD9z8J1LEdF6z
VJua1Wn8HKxnXMI61DhTCSo=
-----END PRIVATE KEY-----
-----BEGIN CERTIFICATE-----
MIIEEzCCAvugAwIBAgIJAK1haYi6gmSKMA0GCSqGSIb3DQEBCwUAMIGeMQswCQYD
VQQGEwJERTEMMAoGA1UECAwDTlJXMRQwEgYDVQQHDAtEdWVzc2VsZG9yZjEbMBkG
A1UECgwSeW91dHViZS1kbCBwcm9qZWN0MRkwFwYDVQQLDBB5b3V0dWJlLWRsIHRl
c3RzMRIwEAYDVQQDDAlsb2NhbGhvc3QxHzAdBgkqhkiG9w0BCQEWEHBoaWhhZ0Bw
aGloYWcuZGUwIBcNMTUwMTMwMDExNTA4WhgPMjExNTAxMDYwMTE1MDhaMIGeMQsw
CQYDVQQGEwJERTEMMAoGA1UECAwDTlJXMRQwEgYDVQQHDAtEdWVzc2VsZG9yZjEb
MBkGA1UECgwSeW91dHViZS1kbCBwcm9qZWN0MRkwFwYDVQQLDBB5b3V0dWJlLWRs
IHRlc3RzMRIwEAYDVQQDDAlsb2NhbGhvc3QxHzAdBgkqhkiG9w0BCQEWEHBoaWhh
Z0BwaGloYWcuZGUwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDMF0bA
zaHAdIyBHRmnIp4vv40lGqEePmWqicCl0QZ0wsb5dNysSxSa7330M2QeQopGfdaU
YF1uTcNpQx6ECgBSfg+RrOBI7r/u4F+sKX8MUXVaf/5QoBUrGNGSn/pp7HMGOuQq
O6BVg4+hA1ySSwUG8mZItLRry1ISyErmW8b9xlqfd97uLME/5tX+sMelRFjUbAx8
A4CK58EvmMguHVTlXzx5RMdYcf1VScYcjlV/qA45uzP8zwI5aigfcmUD+tbGuQRh
KxUhmw0JaobtOR6+JSOAULW5gYa/egE4dWLwbyM6b6eFbdnjlQzEA1EW7ChMPAW/
Mo83KyiPtKMCSQulAgMBAAGjUDBOMB0GA1UdDgQWBBTBUZoqhQkzHQ6xNgZfFxOd
ZEVt8TAfBgNVHSMEGDAWgBTBUZoqhQkzHQ6xNgZfFxOdZEVt8TAMBgNVHRMEBTAD
AQH/MA0GCSqGSIb3DQEBCwUAA4IBAQCUOCl3T/J9B08Z+ijfOJAtkbUaEHuVZb4x
5EpZSy2ZbkLvtsftMFieHVNXn9dDswQc5qjYStCC4o60LKw4M6Y63FRsAZ/DNaqb
PY3jyCyuugZ8/sNf50vHYkAcF7SQYqOQFQX4TQsNUk2xMJIt7H0ErQFmkf/u3dg6
cy89zkT462IwxzSG7NNhIlRkL9o5qg+Y1mF9eZA1B0rcL6hO24PPTHOd90HDChBu
SZ6XMi/LzYQSTf0Vg2R+uMIVlzSlkdcZ6sqVnnqeLL8dFyIa4e9sj/D4ZCYP8Mqe
Z73H5/NNhmwCHRqVUTgm307xblQaWGhwAiDkaRvRW2aJQ0qGEdZK
-----END CERTIFICATE-----
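# --- illustrative sketch; not a file in this archive -------------------------
# testcert.pem above bundles a self-signed private key and certificate in one
# file, the shape needed to wrap a throwaway local server in TLS so HTTPS code
# paths can be exercised without the network. A minimal sketch (Python 3 names;
# ssl.wrap_socket matches the Python versions this codebase targets):
import ssl
from http.server import HTTPServer, SimpleHTTPRequestHandler

httpd = HTTPServer(('localhost', 0), SimpleHTTPRequestHandler)
httpd.socket = ssl.wrap_socket(
    httpd.socket, certfile='test/testcert.pem', server_side=True)
print('serving on https://localhost:%d/' % httpd.socket.getsockname()[1])
# ------------------------------------------------------------------------------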
youtube-dl/youtube_dl/
youtube-dl/youtube_dl/YoutubeDL.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, unicode_literals

import collections
import contextlib
import datetime
import errno
import fileinput
import io
import itertools
import json
import locale
import operator
import os
import platform
import re
import shutil
import subprocess
import socket
import sys
import time
import tokenize
import traceback

if os.name == 'nt':
    import ctypes

from .compat import (
    compat_basestring,
    compat_cookiejar,
    compat_expanduser,
    compat_get_terminal_size,
    compat_http_client,
    compat_kwargs,
    compat_str,
    compat_tokenize_tokenize,
    compat_urllib_error,
    compat_urllib_request,
    compat_urllib_request_DataHandler,
)
from .utils import (
    ContentTooShortError,
    date_from_str,
    DateRange,
    DEFAULT_OUTTMPL,
    determine_ext,
    determine_protocol,
    DownloadError,
    encode_compat_str,
    encodeFilename,
    error_to_compat_str,
    ExtractorError,
    format_bytes,
    formatSeconds,
    locked_file,
    make_HTTPS_handler,
    MaxDownloadsReached,
    PagedList,
    parse_filesize,
    PerRequestProxyHandler,
    PostProcessingError,
    platform_name,
    preferredencoding,
    render_table,
    SameFileError,
    sanitize_filename,
sanitize_path, sanitized_Request, std_headers, subtitles_filename, UnavailableVideoError, url_basename, version_tuple, write_json_file, write_string, YoutubeDLCookieProcessor, YoutubeDLHandler, prepend_extension, replace_extension, args_to_str, age_restricted, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractors from .downloader import get_suitable_downloader from .downloader.rtmp import rtmpdump_version from .postprocessor import ( FFmpegFixupM4aPP, FFmpegFixupStretchedPP, FFmpegMergerPP, FFmpegPostProcessor, get_postprocessor, ) from .version import __version__ class YoutubeDL(object): """YoutubeDL class. YoutubeDL objects are the ones responsible of downloading the actual video file and writing it to disk if the user has requested it, among some other tasks. In most cases there should be one per program. As, given a video URL, the downloader doesn't know how to extract all the needed information, task that InfoExtractors do, it has to pass the URL to one of them. For this, YoutubeDL objects have a method that allows InfoExtractors to be registered in a given order. When it is passed a URL, the YoutubeDL object handles it to the first InfoExtractor it finds that reports being able to handle it. The InfoExtractor extracts all the information about the video or videos the URL refers to, and YoutubeDL process the extracted information, possibly using a File Downloader to download the video. YoutubeDL objects accept a lot of parameters. In order not to saturate the object constructor with arguments, it receives a dictionary of options instead. These options are available through the params attribute for the InfoExtractors to use. The YoutubeDL also registers itself as the downloader in charge for the InfoExtractors that are added to it, so this is a "mutual registration". Available options: username: Username for authentication purposes. password: Password for authentication purposes. videopassword: Password for accessing a video. usenetrc: Use netrc for authentication instead. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. no_warnings: Do not print out anything for warnings. forceurl: Force printing final URL. forcetitle: Force printing title. forceid: Force printing ID. forcethumbnail: Force printing thumbnail URL. forcedescription: Force printing description. forcefilename: Force printing final filename. forceduration: Force printing duration. forcejson: Force printing info_dict as JSON. dump_single_json: Force printing the info_dict of the whole playlist (or video) as a single JSON line. simulate: Do not download the video files. format: Video format code. See options.py for more information. outtmpl: Template for output names. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. force_generic_extractor: Force downloader to use the generic extractor nooverwrites: Prevent overwriting files. playliststart: Playlist item to start at. playlistend: Playlist item to end at. playlist_items: Specific indices of playlist to download. playlistreverse: Download playlist items in reverse order. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. logtostderr: Log messages to stderr instead of stdout. 
writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file writeannotations: Write the video annotations to a .annotations.xml file writethumbnail: Write the thumbnail image to a file write_all_thumbnails: Write all thumbnail formats to files writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatically generated subtitles to a file allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file cachedir: Location of the cache files in the filesystem. False to disable filesystem cache. noplaylist: Download single video instead of a playlist if in doubt. age_limit: An integer representing the user's age in years. Unsuitable videos for the given age are skipped. min_views: An integer representing the minimum view count the video must have in order to not be skipped. Videos without view count information are always downloaded. None for no limit. max_views: An integer representing the maximum view count. Videos that are more popular than that are not downloaded. Videos without view count information are always downloaded. None for no limit. download_archive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. cookiefile: File name where cookies should be read from and dumped to. nocheckcertificate:Do not verify SSL certificates prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use cn_verification_proxy: URL of the proxy to use for IP address verification on Chinese sites. (Experimental) socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi debug_printtraffic:Print out sent and received HTTP traffic include_ads: Download ads as well default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. extract_flat: Do not resolve URLs, return the immediate result. Pass in 'in_playlist' to only show this behavior for playlist items. postprocessors: A list of dictionaries, each with an entry * key: The name of the postprocessor. See youtube_dl/postprocessor/__init__.py for a list. as well as any further keyword arguments for the postprocessor. progress_hooks: A list of functions that get called on download progress, with a dictionary with the entries * status: One of "downloading", "error", or "finished". Check this first and ignore unknown values. If status is one of "downloading", or "finished", the following properties may also be present: * filename: The final filename (always present) * tmpfilename: The filename we're currently writing to * downloaded_bytes: Bytes on disk * total_bytes: Size of the whole file, None if unknown * total_bytes_estimate: Guess of the eventual file size, None if unavailable. * elapsed: The number of seconds since download started. 
* eta: The estimated time in seconds, None if unknown * speed: The download speed in bytes/second, None if unknown * fragment_index: The counter of the currently downloaded video fragment. * fragment_count: The number of fragments (= individual files that will be merged) Progress hooks are guaranteed to be called at least once (with status "finished") if the download is successful. merge_output_format: Extension to use when merging formats. fixup: Automatically correct known faults of the file. One of: - "never": do nothing - "warn": only emit a warning - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) source_address: (Experimental) Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. sleep_interval: Number of seconds to sleep before each download. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. match_filter: A function that gets called with the info_dict of every video. If it returns a message, the video is ignored. If it returns None, the video is downloaded. match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. None or unset for standard (built-in) downloader. hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv. The following parameters are not used by YoutubeDL itself, they are used by the downloader (see youtube_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle, xattr_set_filesize, external_downloader_args, hls_use_mpegts. The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. postprocessor_args: A list of additional command-line arguments for the postprocessor. """ params = None _ies = [] _pps = [] _download_retcode = None _num_downloads = None _screen_file = None def __init__(self, params=None, auto_init=True): """Create a FileDownloader object with the given options.""" if params is None: params = {} self._ies = [] self._ies_instances = {} self._pps = [] self._progress_hooks = [] self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self._err_file = sys.stderr self.params = { # Default parameters 'nocheckcertificate': False, } self.params.update(params) self.cache = Cache(self) if params.get('bidi_workaround', False): try: import pty master, slave = pty.openpty() width = compat_get_terminal_size().columns if width is None: width_args = [] else: width_args = ['-w', str(width)] sp_kwargs = dict( stdin=subprocess.PIPE, stdout=slave, stderr=self._err_file) try: self._output_process = subprocess.Popen( ['bidiv'] + width_args, **sp_kwargs ) except OSError: self._output_process = subprocess.Popen( ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) self._output_channel = os.fdopen(master, 'rb') except OSError as ose: if ose.errno == 2: self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . 
Make sure that fribidi is an executable file in one of the directories in your $PATH.') else: raise if (sys.version_info >= (3,) and sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and not params.get('restrictfilenames', False)): # On Python 3, the Unicode filesystem API will throw errors (#1474) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' 'cannot encode all characters. ' 'Set the LC_ALL environment variable to fix this.') self.params['restrictfilenames'] = True if isinstance(params.get('outtmpl'), bytes): self.report_warning( 'Parameter outtmpl is bytes, but should be a unicode string. ' 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') self._setup_opener() if auto_init: self.print_debug_header() self.add_default_info_extractors() for pp_def_raw in self.params.get('postprocessors', []): pp_class = get_postprocessor(pp_def_raw['key']) pp_def = dict(pp_def_raw) del pp_def['key'] pp = pp_class(self, **compat_kwargs(pp_def)) self.add_post_processor(pp) for ph in self.params.get('progress_hooks', []): self.add_progress_hook(ph) def warn_if_short_id(self, argv): # short YouTube ID starting with dash? idxs = [ i for i, a in enumerate(argv) if re.match(r'^-[0-9A-Za-z_-]{10}$', a)] if idxs: correct_argv = ( ['youtube-dl'] + [a for i, a in enumerate(argv) if i not in idxs] + ['--'] + [argv[i] for i in idxs] ) self.report_warning( 'Long argument string detected. ' 'Use -- to separate parameters and URLs, like this:\n%s\n' % args_to_str(correct_argv)) def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) self._ies_instances[ie.ie_key()] = ie ie.set_downloader(self) def get_info_extractor(self, ie_key): """ Get an instance of an IE with name ie_key, it will try to get one from the _ies list, if there's no instance it will create a new one and add it to the extractor list. 
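        A hypothetical lookup (any extractor's ie_key() value works the same
        way); the second call returns the cached instance:

            ie = ydl.get_info_extractor('Youtube')
            assert ie is ydl.get_info_extractor('Youtube')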
""" ie = self._ies_instances.get(ie_key) if ie is None: ie = get_info_extractor(ie_key)() self.add_info_extractor(ie) return ie def add_default_info_extractors(self): """ Add the InfoExtractors returned by gen_extractors to the end of the list """ for ie in gen_extractors(): self.add_info_extractor(ie) def add_post_processor(self, pp): """Add a PostProcessor object to the end of the chain.""" self._pps.append(pp) pp.set_downloader(self) def add_progress_hook(self, ph): """Add the progress hook (currently only for the file downloader)""" self._progress_hooks.append(ph) def _bidi_workaround(self, message): if not hasattr(self, '_output_channel'): return message assert hasattr(self, '_output_process') assert isinstance(message, compat_str) line_count = message.count('\n') + 1 self._output_process.stdin.write((message + '\n').encode('utf-8')) self._output_process.stdin.flush() res = ''.join(self._output_channel.readline().decode('utf-8') for _ in range(line_count)) return res[:-len('\n')] def to_screen(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" return self.to_stdout(message, skip_eol, check_quiet=True) def _write_string(self, s, out=None): write_string(s, out=out, encoding=self.params.get('encoding')) def to_stdout(self, message, skip_eol=False, check_quiet=False): """Print message to stdout if not in quiet mode.""" if self.params.get('logger'): self.params['logger'].debug(message) elif not check_quiet or not self.params.get('quiet', False): message = self._bidi_workaround(message) terminator = ['\n', ''][skip_eol] output = message + terminator self._write_string(output, self._screen_file) def to_stderr(self, message): """Print message to stderr.""" assert isinstance(message, compat_str) if self.params.get('logger'): self.params['logger'].error(message) else: message = self._bidi_workaround(message) output = message + '\n' self._write_string(output, self._err_file) def to_console_title(self, message): if not self.params.get('consoletitle', False): return if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): # c_wchar_p() might not be necessary if `message` is # already of type unicode() ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) elif 'TERM' in os.environ: self._write_string('\033]0;%s\007' % message, self._screen_file) def save_console_title(self): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: # Save the title on stack self._write_string('\033[22;0t', self._screen_file) def restore_console_title(self): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: # Restore the title from stack self._write_string('\033[23;0t', self._screen_file) def __enter__(self): self.save_console_title() return self def __exit__(self, *args): self.restore_console_title() if self.params.get('cookiefile') is not None: self.cookiejar.save() def trouble(self, message=None, tb=None): """Determine action to take when a download problem appears. Depending on if the downloader has been configured to ignore download errors or not, this method may throw an exception or not when errors are found, after printing the message. tb, if given, is additional traceback information. 
""" if message is not None: self.to_stderr(message) if self.params.get('verbose'): if tb is None: if sys.exc_info()[0]: # if .trouble has been called from an except block tb = '' if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) tb += encode_compat_str(traceback.format_exc()) else: tb_data = traceback.format_list(traceback.extract_stack()) tb = ''.join(tb_data) self.to_stderr(tb) if not self.params.get('ignoreerrors', False): if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: exc_info = sys.exc_info()[1].exc_info else: exc_info = sys.exc_info() raise DownloadError(message, exc_info) self._download_retcode = 1 def report_warning(self, message): ''' Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored ''' if self.params.get('logger') is not None: self.params['logger'].warning(message) else: if self.params.get('no_warnings'): return if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': _msg_header = '\033[0;33mWARNING:\033[0m' else: _msg_header = 'WARNING:' warning_message = '%s %s' % (_msg_header, message) self.to_stderr(warning_message) def report_error(self, message, tb=None): ''' Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. ''' if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': _msg_header = '\033[0;31mERROR:\033[0m' else: _msg_header = 'ERROR:' error_message = '%s %s' % (_msg_header, message) self.trouble(error_message, tb) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" try: self.to_screen('[download] %s has already been downloaded' % file_name) except UnicodeEncodeError: self.to_screen('[download] The file has already been downloaded') def prepare_filename(self, info_dict): """Generate the output filename.""" try: template_dict = dict(info_dict) template_dict['epoch'] = int(time.time()) autonumber_size = self.params.get('autonumber_size') if autonumber_size is None: autonumber_size = 5 autonumber_templ = '%0' + str(autonumber_size) + 'd' template_dict['autonumber'] = autonumber_templ % self._num_downloads if template_dict.get('playlist_index') is not None: template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index']) if template_dict.get('resolution') is None: if template_dict.get('width') and template_dict.get('height'): template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height']) elif template_dict.get('height'): template_dict['resolution'] = '%sp' % template_dict['height'] elif template_dict.get('width'): template_dict['resolution'] = '?x%d' % template_dict['width'] sanitize = lambda k, v: sanitize_filename( compat_str(v), restricted=self.params.get('restrictfilenames'), is_id=(k == 'id')) template_dict = dict((k, sanitize(k, v)) for k, v in template_dict.items() if v is not None) template_dict = collections.defaultdict(lambda: 'NA', template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) tmpl = compat_expanduser(outtmpl) filename = tmpl % template_dict # Temporary fix for #4787 # 'Treat' all problem characters by passing filename through preferredencoding # to workaround encoding issues with subprocess on python2 @ Windows if sys.version_info < (3, 0) and sys.platform == 'win32': filename = 
encodeFilename(filename, True).decode(preferredencoding()) return sanitize_path(filename) except ValueError as err: self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None def _match_entry(self, info_dict, incomplete): """ Returns None iff the file should be downloaded """ video_title = info_dict.get('title', info_dict.get('id', 'video')) if 'title' in info_dict: # This can happen when we're just evaluating the playlist title = info_dict['title'] matchtitle = self.params.get('matchtitle', False) if matchtitle: if not re.search(matchtitle, title, re.IGNORECASE): return '"' + title + '" title did not match pattern "' + matchtitle + '"' rejecttitle = self.params.get('rejecttitle', False) if rejecttitle: if re.search(rejecttitle, title, re.IGNORECASE): return '"' + title + '" title matched reject pattern "' + rejecttitle + '"' date = info_dict.get('upload_date') if date is not None: dateRange = self.params.get('daterange', DateRange()) if date not in dateRange: return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) view_count = info_dict.get('view_count') if view_count is not None: min_views = self.params.get('min_views') if min_views is not None and view_count < min_views: return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views) max_views = self.params.get('max_views') if max_views is not None and view_count > max_views: return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): return 'Skipping "%s" because it is age restricted' % video_title if self.in_download_archive(info_dict): return '%s has already been recorded in archive' % video_title if not incomplete: match_filter = self.params.get('match_filter') if match_filter is not None: ret = match_filter(info_dict) if ret is not None: return ret return None @staticmethod def add_extra_info(info_dict, extra_info): '''Set the keys from extra_info in info dict if they are missing''' for key, value in extra_info.items(): info_dict.setdefault(key, value) def extract_info(self, url, download=True, ie_key=None, extra_info={}, process=True, force_generic_extractor=False): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. 
extra_info is a dict containing the extra values to add to each result ''' if not ie_key and force_generic_extractor: ie_key = 'Generic' if ie_key: ies = [self.get_info_extractor(ie_key)] else: ies = self._ies for ie in ies: if not ie.suitable(url): continue if not ie.working(): self.report_warning('The program functionality for this site has been marked as broken, ' 'and will probably not work.') try: ie_result = ie.extract(url) if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) break if isinstance(ie_result, list): # Backwards compatibility: old IE result format ie_result = { '_type': 'compat_list', 'entries': ie_result, } self.add_default_extra_info(ie_result, ie, url) if process: return self.process_ie_result(ie_result, download, extra_info) else: return ie_result except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) break else: raise else: self.report_error('no suitable InfoExtractor for URL %s' % url) def add_default_extra_info(self, ie_result, ie, url): self.add_extra_info(ie_result, { 'extractor': ie.IE_NAME, 'webpage_url': url, 'webpage_url_basename': url_basename(url), 'extractor_key': ie.ie_key(), }) def process_ie_result(self, ie_result, download=True, extra_info={}): """ Take the result of the ie(may be modified) and resolve all unresolved references (URLs, playlist items). It will also download the videos if 'download'. Returns the resolved ie_result. """ result_type = ie_result.get('_type', 'video') if result_type in ('url', 'url_transparent'): extract_flat = self.params.get('extract_flat', False) if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or extract_flat is True): if self.params.get('forcejson', False): self.to_stdout(json.dumps(ie_result)) return ie_result if result_type == 'video': self.add_extra_info(ie_result, extra_info) return self.process_video_result(ie_result, download=download) elif result_type == 'url': # We have to add extra_info to the results because it may be # contained in a playlist return self.extract_info(ie_result['url'], download, ie_key=ie_result.get('ie_key'), extra_info=extra_info) elif result_type == 'url_transparent': # Use the information from the embedding page info = self.extract_info( ie_result['url'], ie_key=ie_result.get('ie_key'), extra_info=extra_info, download=False, process=False) force_properties = dict( (k, v) for k, v in ie_result.items() if v is not None) for f in ('_type', 'url', 'ie_key'): if f in force_properties: del force_properties[f] new_result = info.copy() new_result.update(force_properties) assert new_result.get('_type') != 'url_transparent' return self.process_ie_result( new_result, download=download, extra_info=extra_info) elif result_type == 'playlist' or result_type == 'multi_video': # We process each entry in the playlist playlist = ie_result.get('title') or ie_result.get('id') self.to_screen('[download] Downloading playlist: %s' % playlist) playlist_results = [] playliststart = self.params.get('playliststart', 1) - 1 playlistend = self.params.get('playlistend') # For backwards compatibility, interpret -1 as whole list if playlistend == -1: playlistend = None playlistitems_str = self.params.get('playlist_items') playlistitems = None if playlistitems_str is not None: def 
iter_playlistitems(format): for string_segment in format.split(','): if '-' in string_segment: start, end = string_segment.split('-') for item in range(int(start), int(end) + 1): yield int(item) else: yield int(string_segment) playlistitems = iter_playlistitems(playlistitems_str) ie_entries = ie_result['entries'] if isinstance(ie_entries, list): n_all_entries = len(ie_entries) if playlistitems: entries = [ ie_entries[i - 1] for i in playlistitems if -n_all_entries <= i - 1 < n_all_entries] else: entries = ie_entries[playliststart:playlistend] n_entries = len(entries) self.to_screen( '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % (ie_result['extractor'], playlist, n_all_entries, n_entries)) elif isinstance(ie_entries, PagedList): if playlistitems: entries = [] for item in playlistitems: entries.extend(ie_entries.getslice( item - 1, item )) else: entries = ie_entries.getslice( playliststart, playlistend) n_entries = len(entries) self.to_screen( '[%s] playlist %s: Downloading %d videos' % (ie_result['extractor'], playlist, n_entries)) else: # iterable if playlistitems: entry_list = list(ie_entries) entries = [entry_list[i - 1] for i in playlistitems] else: entries = list(itertools.islice( ie_entries, playliststart, playlistend)) n_entries = len(entries) self.to_screen( '[%s] playlist %s: Downloading %d videos' % (ie_result['extractor'], playlist, n_entries)) if self.params.get('playlistreverse', False): entries = entries[::-1] for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) extra = { 'n_entries': n_entries, 'playlist': playlist, 'playlist_id': ie_result.get('id'), 'playlist_title': ie_result.get('title'), 'playlist_index': i + playliststart, 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], 'webpage_url_basename': url_basename(ie_result['webpage_url']), 'extractor_key': ie_result['extractor_key'], } reason = self._match_entry(entry, incomplete=True) if reason is not None: self.to_screen('[download] ' + reason) continue entry_result = self.process_ie_result(entry, download=download, extra_info=extra) playlist_results.append(entry_result) ie_result['entries'] = playlist_results self.to_screen('[download] Finished downloading playlist: %s' % playlist) return ie_result elif result_type == 'compat_list': self.report_warning( 'Extractor %s returned a compat_list result. ' 'It needs to be updated.' % ie_result.get('extractor')) def _fixup(r): self.add_extra_info( r, { 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], 'webpage_url_basename': url_basename(ie_result['webpage_url']), 'extractor_key': ie_result['extractor_key'], } ) return r ie_result['entries'] = [ self.process_ie_result(_fixup(r), download, extra_info) for r in ie_result['entries'] ] return ie_result else: raise Exception('Invalid result type: %s' % result_type) def _build_format_filter(self, filter_spec): " Returns a function to filter the formats according to the filter_spec " OPERATORS = { '<': operator.lt, '<=': operator.le, '>': operator.gt, '>=': operator.ge, '=': operator.eq, '!=': operator.ne, } operator_rex = re.compile(r'''(?x)\s* (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) 
$ ''' % '|'.join(map(re.escape, OPERATORS.keys()))) m = operator_rex.search(filter_spec) if m: try: comparison_value = int(m.group('value')) except ValueError: comparison_value = parse_filesize(m.group('value')) if comparison_value is None: comparison_value = parse_filesize(m.group('value') + 'B') if comparison_value is None: raise ValueError( 'Invalid value %r in format specification %r' % ( m.group('value'), filter_spec)) op = OPERATORS[m.group('op')] if not m: STR_OPERATORS = { '=': operator.eq, '!=': operator.ne, '^=': lambda attr, value: attr.startswith(value), '$=': lambda attr, value: attr.endswith(value), '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) \s*(?P<key>ext|acodec|vcodec|container|protocol) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9._-]+) \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') op = STR_OPERATORS[m.group('op')] if not m: raise ValueError('Invalid filter specification %r' % filter_spec) def _filter(f): actual_value = f.get(m.group('key')) if actual_value is None: return m.group('none_inclusive') return op(actual_value, comparison_value) return _filter def build_format_selector(self, format_spec): def syntax_error(note, start): message = ( 'Invalid format specification: ' '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) return SyntaxError(message) PICKFIRST = 'PICKFIRST' MERGE = 'MERGE' SINGLE = 'SINGLE' GROUP = 'GROUP' FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) def _parse_filter(tokens): filter_parts = [] for type, string, start, _, _ in tokens: if type == tokenize.OP and string == ']': return ''.join(filter_parts) else: filter_parts.append(string) def _remove_unused_ops(tokens): # Remove operators that we don't use and join them with the surrounding strings # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' ALLOWED_OPS = ('/', '+', ',', '(', ')') last_string, last_start, last_end, last_line = None, None, None, None for type, string, start, end, line in tokens: if type == tokenize.OP and string == '[': if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None yield type, string, start, end, line # everything inside brackets will be handled by _parse_filter for type, string, start, end, line in tokens: yield type, string, start, end, line if type == tokenize.OP and string == ']': break elif type == tokenize.OP and string in ALLOWED_OPS: if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line last_string = None yield type, string, start, end, line elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: if not last_string: last_string = string last_start = start last_end = end else: last_string += string if last_string: yield tokenize.NAME, last_string, last_start, last_end, last_line def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None for type, string, start, _, _ in tokens: # ENCODING is only defined in python 3.x if type == getattr(tokenize, 'ENCODING', None): continue elif type in [tokenize.NAME, tokenize.NUMBER]: current_selector = FormatSelector(SINGLE, string, []) elif type == tokenize.OP: if string == ')': if not inside_group: # ')' will be handled by the parentheses group tokens.restore_last_token() break elif inside_merge and string in ['/', ',']: 
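                        # A '/' or ',' inside a merge ('+') belongs to the
                        # enclosing selector, so push the token back and
                        # return to the caller.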
tokens.restore_last_token() break elif inside_choice and string == ',': tokens.restore_last_token() break elif string == ',': if not current_selector: raise syntax_error('"," must follow a format selector', start) selectors.append(current_selector) current_selector = None elif string == '/': if not current_selector: raise syntax_error('"/" must follow a format selector', start) first_choice = current_selector second_choice = _parse_format_selection(tokens, inside_choice=True) current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) elif string == '[': if not current_selector: current_selector = FormatSelector(SINGLE, 'best', []) format_filter = _parse_filter(tokens) current_selector.filters.append(format_filter) elif string == '(': if current_selector: raise syntax_error('Unexpected "("', start) group = _parse_format_selection(tokens, inside_group=True) current_selector = FormatSelector(GROUP, group, []) elif string == '+': video_selector = current_selector audio_selector = _parse_format_selection(tokens, inside_merge=True) if not video_selector or not audio_selector: raise syntax_error('"+" must be between two format selectors', start) current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) else: raise syntax_error('Operator not recognized: "{0}"'.format(string), start) elif type == tokenize.ENDMARKER: break if current_selector: selectors.append(current_selector) return selectors def _build_selector_function(selector): if isinstance(selector, list): fs = [_build_selector_function(s) for s in selector] def selector_function(formats): for f in fs: for format in f(formats): yield format return selector_function elif selector.type == GROUP: selector_function = _build_selector_function(selector.selector) elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] def selector_function(formats): for f in fs: picked_formats = list(f(formats)) if picked_formats: return picked_formats return [] elif selector.type == SINGLE: format_spec = selector.selector def selector_function(formats): formats = list(formats) if not formats: return if format_spec == 'all': for f in formats: yield f elif format_spec in ['best', 'worst', None]: format_idx = 0 if format_spec == 'worst' else -1 audiovideo_formats = [ f for f in formats if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: yield audiovideo_formats[format_idx] # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format elif (all(f.get('acodec') != 'none' for f in formats) or all(f.get('vcodec') != 'none' for f in formats)): yield formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ f for f in formats if f.get('vcodec') == 'none'] if audio_formats: yield audio_formats[-1] elif format_spec == 'worstaudio': audio_formats = [ f for f in formats if f.get('vcodec') == 'none'] if audio_formats: yield audio_formats[0] elif format_spec == 'bestvideo': video_formats = [ f for f in formats if f.get('acodec') == 'none'] if video_formats: yield video_formats[-1] elif format_spec == 'worstvideo': video_formats = [ f for f in formats if f.get('acodec') == 'none'] if video_formats: yield video_formats[0] else: extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] if format_spec in extensions: filter_f = lambda f: f['ext'] == format_spec else: filter_f = lambda f: f['format_id'] == format_spec matches = list(filter(filter_f, formats)) if matches: yield matches[-1] elif selector.type == 
MERGE: def _merge(formats_info): format_1, format_2 = [f['format_id'] for f in formats_info] # The first format must contain the video and the # second the audio if formats_info[0].get('vcodec') == 'none': self.report_error('The first format must ' 'contain the video, try using ' '"-f %s+%s"' % (format_2, format_1)) return # Formats must be opposite (video+audio) if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none': self.report_error( 'Both formats %s and %s are video-only, you must specify "-f video+audio"' % (format_1, format_2)) return output_ext = ( formats_info[0]['ext'] if self.params.get('merge_output_format') is None else self.params['merge_output_format']) return { 'requested_formats': formats_info, 'format': '%s+%s' % (formats_info[0].get('format'), formats_info[1].get('format')), 'format_id': '%s+%s' % (formats_info[0].get('format_id'), formats_info[1].get('format_id')), 'width': formats_info[0].get('width'), 'height': formats_info[0].get('height'), 'resolution': formats_info[0].get('resolution'), 'fps': formats_info[0].get('fps'), 'vcodec': formats_info[0].get('vcodec'), 'vbr': formats_info[0].get('vbr'), 'stretched_ratio': formats_info[0].get('stretched_ratio'), 'acodec': formats_info[1].get('acodec'), 'abr': formats_info[1].get('abr'), 'ext': output_ext, } video_selector, audio_selector = map(_build_selector_function, selector.selector) def selector_function(formats): formats = list(formats) for pair in itertools.product(video_selector(formats), audio_selector(formats)): yield _merge(pair) filters = [self._build_format_filter(f) for f in selector.filters] def final_selector(formats): for _filter in filters: formats = list(filter(_filter, formats)) return selector_function(formats) return final_selector stream = io.BytesIO(format_spec.encode('utf-8')) try: tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) except tokenize.TokenError: raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) class TokenIterator(object): def __init__(self, tokens): self.tokens = tokens self.counter = 0 def __iter__(self): return self def __next__(self): if self.counter >= len(self.tokens): raise StopIteration() value = self.tokens[self.counter] self.counter += 1 return value next = __next__ def restore_last_token(self): self.counter -= 1 parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): res = std_headers.copy() add_headers = info_dict.get('http_headers') if add_headers: res.update(add_headers) cookies = self._calc_cookies(info_dict) if cookies: res['Cookie'] = cookies return res def _calc_cookies(self, info_dict): pr = sanitized_Request(info_dict['url']) self.cookiejar.add_cookie_header(pr) return pr.get_header('Cookie') def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' if 'id' not in info_dict: raise ExtractorError('Missing "id" field in extractor result') if 'title' not in info_dict: raise ExtractorError('Missing "title" field in extractor result') if 'playlist' not in info_dict: # It isn't part of a playlist info_dict['playlist'] = None info_dict['playlist_index'] = None thumbnails = info_dict.get('thumbnails') if thumbnails is None: thumbnail = info_dict.get('thumbnail') if thumbnail: info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] if thumbnails: thumbnails.sort(key=lambda t: ( t.get('preference'), t.get('width'), 
t.get('height'), t.get('id'), t.get('url'))) for i, t in enumerate(thumbnails): if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: t['id'] = '%d' % i if thumbnails and 'thumbnail' not in info_dict: info_dict['thumbnail'] = thumbnails[-1]['url'] if 'display_id' not in info_dict and 'id' in info_dict: info_dict['display_id'] = info_dict['id'] if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: # Working around out-of-range timestamp values (e.g. negative ones on Windows, # see http://bugs.python.org/issue1646728) try: upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) info_dict['upload_date'] = upload_date.strftime('%Y%m%d') except (ValueError, OverflowError, OSError): pass # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. for field in ('chapter', 'season', 'episode'): if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) subtitles = info_dict.get('subtitles') if subtitles: for _, subtitle in subtitles.items(): for subtitle_format in subtitle: if 'ext' not in subtitle_format: subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') self.list_subtitles(info_dict['id'], subtitles, 'subtitles') return info_dict['requested_subtitles'] = self.process_subtitles( info_dict['id'], subtitles, info_dict.get('automatic_captions')) # We now pick which formats have to be downloaded if info_dict.get('formats') is None: # There's only one format available formats = [info_dict] else: formats = info_dict['formats'] if not formats: raise ExtractorError('No video formats found!') formats_dict = {} # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): if 'url' not in format: raise ExtractorError('Missing "url" key in result (index %d)' % i) if format.get('format_id') is None: format['format_id'] = compat_str(i) else: # Sanitize format_id from characters used in format selector expression format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id']) format_id = format['format_id'] if format_id not in formats_dict: formats_dict[format_id] = [] formats_dict[format_id].append(format) # Make sure all formats have unique format_id for format_id, ambiguous_formats in formats_dict.items(): if len(ambiguous_formats) > 1: for i, format in enumerate(ambiguous_formats): format['format_id'] = '%s-%d' % (format_id, i) for i, format in enumerate(formats): if format.get('format') is None: format['format'] = '{id} - {res}{note}'.format( id=format['format_id'], res=self.format_resolution(format), note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', ) # Automatically determine file extension if missing if 'ext' not in format: format['ext'] = determine_ext(format['url']).lower() # Automatically determine protocol if missing (useful for format # selection purposes) if 'protocol' not in format: format['protocol'] = determine_protocol(format) # Add HTTP headers, so that external programs can use them from the # json output full_format_info = info_dict.copy() full_format_info.update(format) format['http_headers'] = 
self._calc_headers(full_format_info) # TODO Central sorting goes here if formats[0] is not info_dict: # only set the 'formats' fields if the original info_dict list them # otherwise we end up with a circular reference, the first (and unique) # element in the 'formats' field in info_dict is info_dict itself, # which can't be exported to json info_dict['formats'] = formats if self.params.get('listformats'): self.list_formats(info_dict) return if self.params.get('list_thumbnails'): self.list_thumbnails(info_dict) return req_format = self.params.get('format') if req_format is None: req_format_list = [] if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and not info_dict.get('is_live')): merger = FFmpegMergerPP(self) if merger.available and merger.can_merge(): req_format_list.append('bestvideo+bestaudio') req_format_list.append('best') req_format = '/'.join(req_format_list) format_selector = self.build_format_selector(req_format) formats_to_download = list(format_selector(formats)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) if download: if len(formats_to_download) > 1: self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download))) for format in formats_to_download: new_info = dict(info_dict) new_info.update(format) self.process_info(new_info) # We update the info dict with the best quality format (backwards compatibility) info_dict.update(formats_to_download[-1]) return info_dict def process_subtitles(self, video_id, normal_subtitles, automatic_captions): """Select the requested subtitles and their format""" available_subs = {} if normal_subtitles and self.params.get('writesubtitles'): available_subs.update(normal_subtitles) if automatic_captions and self.params.get('writeautomaticsub'): for lang, cap_info in automatic_captions.items(): if lang not in available_subs: available_subs[lang] = cap_info if (not self.params.get('writesubtitles') and not self.params.get('writeautomaticsub') or not available_subs): return None if self.params.get('allsubtitles', False): requested_langs = available_subs.keys() else: if self.params.get('subtitleslangs', False): requested_langs = self.params.get('subtitleslangs') elif 'en' in available_subs: requested_langs = ['en'] else: requested_langs = [list(available_subs.keys())[0]] formats_query = self.params.get('subtitlesformat', 'best') formats_preference = formats_query.split('/') if formats_query else [] subs = {} for lang in requested_langs: formats = available_subs.get(lang) if formats is None: self.report_warning('%s subtitles not available for %s' % (lang, video_id)) continue for ext in formats_preference: if ext == 'best': f = formats[-1] break matches = list(filter(lambda f: f['ext'] == ext, formats)) if matches: f = matches[-1] break else: f = formats[-1] self.report_warning( 'No subtitle format found matching "%s" for language %s, ' 'using %s' % (formats_query, lang, f['ext'])) subs[lang] = f return subs def process_info(self, info_dict): """Process a single resolved IE result.""" assert info_dict.get('_type', 'video') == 'video' max_downloads = self.params.get('max_downloads') if max_downloads is not None: if self._num_downloads >= int(max_downloads): raise MaxDownloadsReached() info_dict['fulltitle'] = info_dict['title'] if len(info_dict['title']) > 200: info_dict['title'] = info_dict['title'][:197] + '...' 
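        # The 200-character cap above keeps '%(title)s'-based output templates
        # from overflowing common filesystem filename limits; the untruncated
        # value remains available as info_dict['fulltitle'].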
if 'format' not in info_dict: info_dict['format'] = info_dict['ext'] reason = self._match_entry(info_dict, incomplete=False) if reason is not None: self.to_screen('[download] ' + reason) return self._num_downloads += 1 info_dict['_filename'] = filename = self.prepare_filename(info_dict) # Forced printings if self.params.get('forcetitle', False): self.to_stdout(info_dict['fulltitle']) if self.params.get('forceid', False): self.to_stdout(info_dict['id']) if self.params.get('forceurl', False): if info_dict.get('requested_formats') is not None: for f in info_dict['requested_formats']: self.to_stdout(f['url'] + f.get('play_path', '')) else: # For RTMP URLs, also include the playpath self.to_stdout(info_dict['url'] + info_dict.get('play_path', '')) if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None: self.to_stdout(info_dict['thumbnail']) if self.params.get('forcedescription', False) and info_dict.get('description') is not None: self.to_stdout(info_dict['description']) if self.params.get('forcefilename', False) and filename is not None: self.to_stdout(filename) if self.params.get('forceduration', False) and info_dict.get('duration') is not None: self.to_stdout(formatSeconds(info_dict['duration'])) if self.params.get('forceformat', False): self.to_stdout(info_dict['format']) if self.params.get('forcejson', False): self.to_stdout(json.dumps(info_dict)) # Do nothing else if in simulate mode if self.params.get('simulate', False): return if filename is None: return try: dn = os.path.dirname(sanitize_path(encodeFilename(filename))) if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: self.report_error('unable to create directory ' + error_to_compat_str(err)) return if self.params.get('writedescription', False): descfn = replace_extension(filename, 'description', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)): self.to_screen('[info] Video description is already present') elif info_dict.get('description') is None: self.report_warning('There\'s no description to write.') else: try: self.to_screen('[info] Writing video description to: ' + descfn) with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: descfile.write(info_dict['description']) except (OSError, IOError): self.report_error('Cannot write description file ' + descfn) return if self.params.get('writeannotations', False): annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): self.to_screen('[info] Video annotations are already present') else: try: self.to_screen('[info] Writing video annotations to: ' + annofn) with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: annofile.write(info_dict['annotations']) except (KeyError, TypeError): self.report_warning('There are no annotations to write.') except (OSError, IOError): self.report_error('Cannot write annotations file: ' + annofn) return subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) if subtitles_are_requested and info_dict.get('requested_subtitles'): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['requested_subtitles'] ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] if 
sub_info.get('data') is not None: sub_data = sub_info['data'] else: try: sub_data = ie._download_webpage( sub_info['url'], info_dict['id'], note=False) except ExtractorError as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err.cause))) continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return if self.params.get('writeinfojson', False): infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): self.to_screen('[info] Video description metadata is already present') else: self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn) try: write_json_file(self.filter_requested_info(info_dict), infofn) except (OSError, IOError): self.report_error('Cannot write metadata to JSON file ' + infofn) return self._write_thumbnails(info_dict, filename) if not self.params.get('skip_download', False): try: def dl(name, info): fd = get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) return fd.download(name, info) if info_dict.get('requested_formats') is not None: downloaded = [] success = True merger = FFmpegMergerPP(self) if not merger.available: postprocessors = [] self.report_warning('You have requested multiple ' 'formats but ffmpeg or avconv are not installed.' 
if info_dict.get('requested_formats') is not None: downloaded = [] success = True merger = FFmpegMergerPP(self) if not merger.available: postprocessors = [] self.report_warning('You have requested multiple formats but ffmpeg or avconv are not installed. The formats won\'t be merged.') else: postprocessors = [merger] def compatible_formats(formats): video, audio = formats # Check extension video_ext, audio_ext = video.get('ext'), audio.get('ext') if video_ext and audio_ext: COMPATIBLE_EXTS = ( ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'), ('webm',) ) for exts in COMPATIBLE_EXTS: if video_ext in exts and audio_ext in exts: return True # TODO: Check acodec/vcodec return False filename_real_ext = os.path.splitext(filename)[1][1:] filename_wo_ext = ( os.path.splitext(filename)[0] if filename_real_ext == info_dict['ext'] else filename) requested_formats = info_dict['requested_formats'] if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats): info_dict['ext'] = 'mkv' self.report_warning( 'Requested formats are incompatible for merge and will be merged into mkv.') # Ensure filename always has a correct extension for successful merge filename = '%s.%s' % (filename_wo_ext, info_dict['ext']) if os.path.exists(encodeFilename(filename)): self.to_screen( '[download] %s has already been downloaded and merged' % filename) else: for f in requested_formats: new_info = dict(info_dict) new_info.update(f) fname = self.prepare_filename(new_info) fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext']) downloaded.append(fname) partial_success = dl(fname, new_info) success = success and partial_success info_dict['__postprocessors'] = postprocessors info_dict['__files_to_merge'] = downloaded else: # Just a single file success = dl(filename, info_dict) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_error('unable to download video data: %s' % str(err)) return except (OSError, IOError) as err: raise UnavailableVideoError(err) except (ContentTooShortError, ) as err: self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) return if success: # Fixup content fixup_policy = self.params.get('fixup') if fixup_policy is None: fixup_policy = 'detect_or_warn' stretched_ratio = info_dict.get('stretched_ratio') if stretched_ratio is not None and stretched_ratio != 1: if fixup_policy == 'warn': self.report_warning('%s: Non-uniform pixel ratio (%s)' % ( info_dict['id'], stretched_ratio)) elif fixup_policy == 'detect_or_warn': stretched_pp = FFmpegFixupStretchedPP(self) if stretched_pp.available: info_dict.setdefault('__postprocessors', []) info_dict['__postprocessors'].append(stretched_pp) else: self.report_warning( '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % ( info_dict['id'], stretched_ratio)) else: assert fixup_policy in ('ignore', 'never') if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash': if fixup_policy == 'warn': self.report_warning('%s: writing DASH m4a. Only some players support this container.' % ( info_dict['id'])) elif fixup_policy == 'detect_or_warn': fixup_pp = FFmpegFixupM4aPP(self) if fixup_pp.available: info_dict.setdefault('__postprocessors', []) info_dict['__postprocessors'].append(fixup_pp) else: self.report_warning( '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.'
% ( info_dict['id'])) else: assert fixup_policy in ('ignore', 'never') try: self.post_process(filename, info_dict) except (PostProcessingError) as err: self.report_error('postprocessing: %s' % str(err)) return self.record_download_archive(info_dict) def download(self, url_list): """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and '%' not in outtmpl and self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) for url in url_list: try: # It also downloads the videos res = self.extract_info( url, force_generic_extractor=self.params.get('force_generic_extractor', False)) except UnavailableVideoError: self.report_error('unable to download video') except MaxDownloadsReached: self.to_screen('[info] Maximum number of downloaded files reached.') raise else: if self.params.get('dump_single_json', False): self.to_stdout(json.dumps(res)) return self._download_retcode def download_with_info_file(self, info_filename): with contextlib.closing(fileinput.FileInput( [info_filename], mode='r', openhook=fileinput.hook_encoded('utf-8'))) as f: # FileInput doesn't have a read method, we can't call json.load info = self.filter_requested_info(json.loads('\n'.join(f))) try: self.process_ie_result(info, download=True) except DownloadError: webpage_url = info.get('webpage_url') if webpage_url is not None: self.report_warning('The info failed to download, trying with "%s"' % webpage_url) return self.download([webpage_url]) else: raise return self._download_retcode @staticmethod def filter_requested_info(info_dict): return dict( (k, v) for k, v in info_dict.items() if k not in ['requested_formats', 'requested_subtitles']) def post_process(self, filename, ie_info): """Run all the postprocessors on the given file.""" info = dict(ie_info) info['filepath'] = filename pps_chain = [] if ie_info.get('__postprocessors') is not None: pps_chain.extend(ie_info['__postprocessors']) pps_chain.extend(self._pps) for pp in pps_chain: files_to_delete = [] try: files_to_delete, info = pp.run(info) except PostProcessingError as e: self.report_error(e.msg) if files_to_delete and not self.params.get('keepvideo', False): for old_filename in files_to_delete: self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) try: os.remove(encodeFilename(old_filename)) except (IOError, OSError): self.report_warning('Unable to remove downloaded original file') def _make_archive_id(self, info_dict): # Future-proof against any change in case # and backwards compatibility with prior versions extractor = info_dict.get('extractor_key') if extractor is None: if 'id' in info_dict: extractor = info_dict.get('ie_key') # key in a playlist if extractor is None: return None # Incomplete video information return extractor.lower() + ' ' + info_dict['id'] def in_download_archive(self, info_dict): fn = self.params.get('download_archive') if fn is None: return False vid_id = self._make_archive_id(info_dict) if vid_id is None: return False # Incomplete video information try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: if line.strip() == vid_id: return True except IOError as ioe: if ioe.errno != errno.ENOENT: raise return False def record_download_archive(self, info_dict): fn = self.params.get('download_archive') if fn is None: return vid_id = self._make_archive_id(info_dict) assert vid_id with locked_file(fn, 'a', encoding='utf-8') as archive_file: archive_file.write(vid_id + '\n') @staticmethod def format_resolution(format, 
default='unknown'): if format.get('vcodec') == 'none': return 'audio only' if format.get('resolution') is not None: return format['resolution'] if format.get('height') is not None: if format.get('width') is not None: res = '%sx%s' % (format['width'], format['height']) else: res = '%sp' % format['height'] elif format.get('width') is not None: res = '%dx?' % format['width'] else: res = default return res def _format_note(self, fdict): res = '' if fdict.get('ext') in ['f4f', 'f4m']: res += '(unsupported) ' if fdict.get('language'): if res: res += ' ' res += '[%s]' % fdict['language'] if fdict.get('format_note') is not None: res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: res += '%4dk ' % fdict['tbr'] if fdict.get('container') is not None: if res: res += ', ' res += '%s container' % fdict['container'] if (fdict.get('vcodec') is not None and fdict.get('vcodec') != 'none'): if res: res += ', ' res += fdict['vcodec'] if fdict.get('vbr') is not None: res += '@' elif fdict.get('vbr') is not None and fdict.get('abr') is not None: res += 'video@' if fdict.get('vbr') is not None: res += '%4dk' % fdict['vbr'] if fdict.get('fps') is not None: res += ', %sfps' % fdict['fps'] if fdict.get('acodec') is not None: if res: res += ', ' if fdict['acodec'] == 'none': res += 'video only' else: res += '%-5s' % fdict['acodec'] elif fdict.get('abr') is not None: if res: res += ', ' res += 'audio' if fdict.get('abr') is not None: res += '@%3dk' % fdict['abr'] if fdict.get('asr') is not None: res += ' (%5dHz)' % fdict['asr'] if fdict.get('filesize') is not None: if res: res += ', ' res += format_bytes(fdict['filesize']) elif fdict.get('filesize_approx') is not None: if res: res += ', ' res += '~' + format_bytes(fdict['filesize_approx']) return res def list_formats(self, info_dict): formats = info_dict.get('formats', [info_dict]) table = [ [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)] for f in formats if f.get('preference') is None or f['preference'] >= -1000] if len(formats) > 1: table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)' header_line = ['format code', 'extension', 'resolution', 'note'] self.to_screen( '[info] Available formats for %s:\n%s' % (info_dict['id'], render_table(header_line, table))) def list_thumbnails(self, info_dict): thumbnails = info_dict.get('thumbnails') if not thumbnails: tn_url = info_dict.get('thumbnail') if tn_url: thumbnails = [{'id': '0', 'url': tn_url}] else: self.to_screen( '[info] No thumbnails present for %s' % info_dict['id']) return self.to_screen( '[info] Thumbnails for %s:' % info_dict['id']) self.to_screen(render_table( ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) def list_subtitles(self, video_id, subtitles, name='subtitles'): if not subtitles: self.to_screen('%s has no %s' % (video_id, name)) return self.to_screen( 'Available %s for %s:' % (name, video_id)) self.to_screen(render_table( ['Language', 'formats'], [[lang, ', '.join(f['ext'] for f in reversed(formats))] for lang, formats in subtitles.items()])) def urlopen(self, req): """ Start an HTTP download """ if isinstance(req, compat_basestring): req = sanitized_Request(req) return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): if not self.params.get('verbose'): return if type('') is not compat_str: # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326) self.report_warning( 'Your Python is broken! 
Update to a newer and supported version') stdout_encoding = getattr( sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__) encoding_str = ( '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % ( locale.getpreferredencoding(), sys.getfilesystemencoding(), stdout_encoding, self.get_encoding())) write_string(encoding_str, encoding=None) self._write_string('[debug] youtube-dl version ' + __version__ + '\n') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(__file__))) out, err = sp.communicate() out = out.decode().strip() if re.match('[0-9a-f]+', out): self._write_string('[debug] Git HEAD: ' + out + '\n') except Exception: try: sys.exc_clear() except Exception: pass self._write_string('[debug] Python version %s - %s\n' % ( platform.python_version(), platform_name())) exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() exe_str = ', '.join( '%s %s' % (exe, v) for exe, v in sorted(exe_versions.items()) if v ) if not exe_str: exe_str = 'none' self._write_string('[debug] exe versions: %s\n' % exe_str) proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): proxy_map.update(handler.proxies) self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') if self.params.get('call_home', False): ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') self._write_string('[debug] Public IP address: %s\n' % ipaddr) latest_version = self.urlopen( 'https://yt-dl.org/latest/version').read().decode('utf-8') if version_tuple(latest_version) > version_tuple(__version__): self.report_warning( 'You are using an outdated version (newest version: %s)! ' 'See https://yt-dl.org/update if you need help updating.' 
% latest_version) def _setup_opener(self): timeout_val = self.params.get('socket_timeout') self._socket_timeout = 600 if timeout_val is None else float(timeout_val) opts_cookiefile = self.params.get('cookiefile') opts_proxy = self.params.get('proxy') if opts_cookiefile is None: self.cookiejar = compat_cookiejar.CookieJar() else: self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): self.cookiejar.load() cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: if opts_proxy == '': proxies = {} else: proxies = {'http': opts_proxy, 'https': opts_proxy} else: proxies = compat_urllib_request.getproxies() # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] proxy_handler = PerRequestProxyHandler(proxies) debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) data_handler = compat_urllib_request_DataHandler() # When passing our own FileHandler instance, build_opener won't add the # default FileHandler and allows us to disable the file protocol, which # can be used for malicious purposes (see # https://github.com/rg3/youtube-dl/issues/8227) file_handler = compat_urllib_request.FileHandler() def file_open(*args, **kwargs): raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons') file_handler.file_open = file_open opener = compat_urllib_request.build_opener( proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play # (See https://github.com/rg3/youtube-dl/issues/1309 for details) opener.addheaders = [] self._opener = opener def encode(self, s): if isinstance(s, bytes): return s # Already encoded try: return s.encode(self.get_encoding()) except UnicodeEncodeError as err: err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.' raise def get_encoding(self): encoding = self.params.get('encoding') if encoding is None: encoding = preferredencoding() return encoding def _write_thumbnails(self, info_dict, filename): if self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') if thumbnails: thumbnails = [thumbnails[-1]] elif self.params.get('write_all_thumbnails', False): thumbnails = info_dict.get('thumbnails') else: return if not thumbnails: # No thumbnails present, so return immediately return for t in thumbnails: thumb_ext = determine_ext(t['url'], 'jpg') suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen('[%s] %s: Thumbnail %sis already present' % (info_dict['extractor'], info_dict['id'], thumb_display_id)) else: self.to_screen('[%s] %s: Downloading thumbnail %s...' 
% (info_dict['extractor'], info_dict['id'], thumb_display_id)) try: uf = self.urlopen(t['url']) with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_warning('Unable to download thumbnail "%s": %s' % (t['url'], error_to_compat_str(err)))
youtube-dl/youtube_dl/extractor/ninegag.py
from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import str_to_int class NineGagIE(InfoExtractor): IE_NAME = '9gag' _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P<id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^?#/]+))?' _TESTS = [{ 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', 'info_dict': { 'id': 'Kk2X5', 'ext': 'mp4', 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)', 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome', 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA', 'uploader': 'CompilationChannel', 'upload_date': '20131110', 'view_count': int, }, 'add_ie': ['Youtube'], },
This guy's video is epic!!", 'uploader_id': 'rickmereki', 'uploader': 'Rick Mereki', 'upload_date': '20110803', 'view_count': int, }, 'add_ie': ['Vimeo'], }, { 'url': 'http://9gag.com/tv/p/KklwM', 'only_matching': True, }, { 'url': 'http://9gag.tv/p/Kk2X5', 'only_matching': True, }, { 'url': 'http://9gag.com/tv/embed/a5Dmvl', 'only_matching': True, }] _EXTERNAL_VIDEO_PROVIDER = { '1': { 'url': '%s', 'ie_key': 'Youtube', }, '2': { 'url': 'http://player.vimeo.com/video/%s', 'ie_key': 'Vimeo', }, '3': { 'url': 'http://instagram.com/p/%s', 'ie_key': 'Instagram', }, '4': { 'url': 'http://vine.co/v/%s', 'ie_key': 'Vine', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id webpage = self._download_webpage(url, display_id) post_view = self._parse_json( self._search_regex( r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost', webpage, 'post view'), display_id) ie_key = None source_url = post_view.get('sourceUrl') if not source_url: external_video_id = post_view['videoExternalId'] external_video_provider = post_view['videoExternalProvider'] source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key'] title = post_view['title'] description = post_view.get('description') view_count = str_to_int(post_view.get('externalView')) thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w') return { '_type': 'url_transparent', 'url': source_url, 'ie_key': ie_key, 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'view_count': view_count, 'thumbnail': thumbnail, } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/bandcamp.py���������������������������������������������������������0000644�0000000�0000000�00000015364�12641030331�020154� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..compat import ( compat_str, compat_urlparse, ) from ..utils import ( ExtractorError, float_or_none, int_or_none, ) class BandcampIE(InfoExtractor): _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', 'ext': 'mp3', 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', 'md5': '2b68e5851514c20efdff2afc5603b8b4', 'info_dict': { 'id': '2650410135', 'ext': 'mp3', 'title': 
youtube-dl/youtube_dl/extractor/bandcamp.py
from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..compat import ( compat_str, compat_urlparse, ) from ..utils import ( ExtractorError, float_or_none, int_or_none, ) class BandcampIE(InfoExtractor): _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', 'ext': 'mp3', 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, }, 'skip': 'There is a limit of 200 free downloads / month for the test song' }, { 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', 'md5': '2b68e5851514c20efdff2afc5603b8b4', 'info_dict': { 'id': '2650410135', 'ext': 'mp3', 'title': 'Lanius (Battle)', 'uploader': 'Ben Prunty Music', }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if not m_download: m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) if m_trackinfo: json_code = m_trackinfo.group(1) data = json.loads(json_code)[0] formats = [] for format_id, format_url in data['file'].items(): ext, abr_str = format_id.split('-', 1) formats.append({ 'format_id': format_id, 'url': self._proto_relative_url(format_url, 'http:'), 'ext': ext, 'vcodec': 'none', 'acodec': ext, 'abr': int_or_none(abr_str), }) self._sort_formats(formats) return { 'id': compat_str(data['id']), 'title': data['title'], 'formats': formats, 'duration': float_or_none(data.get('duration')), } else: raise ExtractorError('No free songs found') download_link = m_download.group(1) video_id = self._search_regex( r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', webpage, 'video id') download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') # We get the dictionary of the track from some javascript code all_info = self._parse_json(self._search_regex( r'(?sm)items: (.*?),$', download_webpage, 'items'), video_id) info = all_info[0] # We pick mp3-320 for now, until format selection can be easily implemented. mp3_info = info['downloads']['mp3-320'] # If we try to use this url it says the link has expired initial_url = mp3_info['url'] m_url = re.match( r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$', initial_url) # We build the url we will use to get the final track url # This url is built by Bandcamp in the script download_bunde_*.js request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts')) final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url') # If we could correctly generate the .rand field the url would be # in the "download_url" key final_url = self._proto_relative_url(self._search_regex( r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:') return { 'id': video_id, 'title': info['title'], 'ext': 'mp3', 'vcodec': 'none', 'url': final_url, 'thumbnail': info.get('thumb_url'), 'uploader': info.get('artist'), }
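# The trackinfo branch above relies on Bandcamp's file keys following an
# '<ext>-<abr>' convention; a minimal sketch of that split (the key value is
# taken from the code above, nothing else is assumed):
#
#   ext, abr_str = 'mp3-320'.split('-', 1)
#   assert (ext, int(abr_str)) == ('mp3', 320)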
class BandcampAlbumIE(InfoExtractor): IE_NAME = 'Bandcamp:album' _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))' _TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', 'playlist': [ { 'md5': '39bc1eded3476e927c724321ddf116cf', 'info_dict': { 'id': '1353101989', 'ext': 'mp3', 'title': 'Intro', } }, { 'md5': '1a2c32e2691474643e912cc6cd4bffaa', 'info_dict': { 'id': '38097443', 'ext': 'mp3', 'title': 'Kero One - Keep It Alive (Blazo remix)', } }, ], 'info_dict': { 'title': 'Jazz Format Mixtape vol.1', 'id': 'jazz-format-mixtape-vol-1', 'uploader_id': 'blazo', }, 'params': { 'playlistend': 2 }, 'skip': 'Bandcamp imposes download limits.' }, { 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave', 'info_dict': { 'title': 'Hierophany of the Open Grave', 'uploader_id': 'nightbringer', 'id': 'hierophany-of-the-open-grave', }, 'playlist_mincount': 9, }, { 'url': 'http://dotscale.bandcamp.com', 'info_dict': { 'title': 'Loom', 'id': 'dotscale', 'uploader_id': 'dotscale', }, 'playlist_mincount': 7, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader_id = mobj.group('subdomain') album_id = mobj.group('album_id') playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage) if not tracks_paths: raise ExtractorError('The page doesn\'t contain any tracks') entries = [ self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) for t_path in tracks_paths] title = self._search_regex( r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False) return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, 'title': title, 'entries': entries, }
youtube-dl/youtube_dl/extractor/cinemassacre.py
# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ExtractorError from .screenwavemedia import ScreenwaveMediaIE class CinemassacreIE(InfoExtractor): _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)' _TESTS = [ { 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', 'md5': 'fde81fbafaee331785f58cd6c0d46190', 'info_dict': { 'id': 'Cinemassacre-19911', 'ext': 'mp4', 'upload_date': '20121110', 'title': '“Angry Video Game Nerd: The Movie” – Trailer', 'description': 'md5:fb87405fcb42a331742a0dce2708560b', }, }, { 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', 'md5': 'd72f10cd39eac4215048f62ab477a511', 'info_dict': { 'id': 'Cinemassacre-521be8ef82b16', 'ext': 'mp4', 'upload_date': '20131002', 'title': 'The Mummy’s Hand (1940)', }, }, { # Youtube embedded video 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', 'md5': 'df4cf8a1dcedaec79a73d96d83b99023', 'info_dict': { 'id': 'OEVzPCY2T-g', 'ext': 'mp4', 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', 'upload_date': '20061207', 'uploader': 'Cinemassacre', 'uploader_id': 'JamesNintendoNerd', 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', } }, { # Youtube embedded video 'url': 'http://cinemassacre.com/2006/09/01/mckids/', 'md5': '6eb30961fa795fedc750eac4881ad2e1', 'info_dict': { 'id': 'FnxsNhuikpo', 'ext': 'mp4', 'upload_date': '20060901', 'uploader': 'Cinemassacre Extras', 'description':
'md5:de9b751efa9e45fbaafd9c8a1123ed53', 'uploader_id': 'Cinemassacre', 'title': 'AVGN: McKids', } }, { 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', 'md5': '1376908e49572389e7b06251a53cdd08', 'info_dict': { 'id': 'Cinemassacre-555779690c440', 'ext': 'mp4', 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', 'upload_date': '20150525', } } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('display_id') video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d') webpage = self._download_webpage(url, display_id) playerdata_url = self._search_regex( [ ScreenwaveMediaIE.EMBED_PATTERN, r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', ], webpage, 'player data URL', default=None, group='url') if not playerdata_url: raise ExtractorError('Unable to find player data') video_title = self._html_search_regex( r'<title>(?P<title>.+?)\|', webpage, 'title') video_description = self._html_search_regex( r'<div class="entry-content">(?P<description>.+?)</div>', webpage, 'description', flags=re.DOTALL, fatal=False) video_thumbnail = self._og_search_thumbnail(webpage) return { '_type': 'url_transparent', 'display_id': display_id, 'title': video_title, 'description': video_description, 'upload_date': video_date, 'thumbnail': video_thumbnail, 'url': playerdata_url, }
youtube-dl/youtube_dl/extractor/videopremium.py
from __future__ import unicode_literals import re import random from .common import InfoExtractor class VideoPremiumIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?'
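# Sketch of URLs the pattern above accepts — the first comes from the test
# below; the second (.me domain, trailing path) is hypothetical:
#
#   http://videopremium.tv/4w7oadjsf156
#   http://videopremium.me/4w7oadjsf156/some-title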
_TEST = { 'url': 'http://videopremium.tv/4w7oadjsf156', 'info_dict': { 'id': '4w7oadjsf156', 'ext': 'f4v', 'title': 'youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4' }, 'params': { 'skip_download': True, }, 'skip': 'Test file has been deleted.', } def _real_extract(self, url): video_id = self._match_id(url) webpage_url = 'http://videopremium.tv/' + video_id webpage = self._download_webpage(webpage_url, video_id) if re.match(r'^<html><head><script[^>]*>window.location\s*=', webpage): # Download again, we need a cookie webpage = self._download_webpage( webpage_url, video_id, note='Downloading webpage again (with cookie)') video_title = self._html_search_regex( r'<h2(?:.*?)>\s*(.+?)\s*<', webpage, 'video title') return { 'id': video_id, 'url': 'rtmp://e%d.md.iplay.md/play' % random.randint(1, 16), 'play_path': 'mp4:%s.f4v' % video_id, 'page_url': 'http://videopremium.tv/' + video_id, 'player_url': 'http://videopremium.tv/uplayer/uppod.swf', 'ext': 'f4v', 'title': video_title, }
youtube-dl/youtube_dl/extractor/npo.py
from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( fix_xml_ampersands, parse_duration, qualities, strip_jsonp, unified_strdate, ) class NPOBaseIE(InfoExtractor): def _get_token(self, video_id): token_page = self._download_webpage( 'http://ida.omroep.nl/npoplayer/i.js', video_id, note='Downloading token') token = self._search_regex( r'npoplayer\.token = "(.+?)"', token_page, 'token') # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js token_l = list(token) first = second = None for i in range(5, len(token_l) - 4): if token_l[i].isdigit(): if first is None: first = i elif second is None: second = i if first is None or second is None: first = 12 second = 13 token_l[first], token_l[second] = token_l[second], token_l[first] return ''.join(token_l) class NPOIE(NPOBaseIE): IE_NAME = 'npo' IE_DESC = 'npo.nl and ntr.nl' _VALID_URL = r'''(?x) (?: npo:| https?:// (?:www\.)?
(?: npo\.nl/(?!live|radio)(?:[^/]+/){2}| ntr\.nl/(?:[^/]+/){2,}| omroepwnl\.nl/video/fragment/[^/]+__ ) ) (?P<id>[^/?#]+) ''' _TESTS = [ { 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', 'md5': '4b3f9c429157ec4775f2c9cb7b911016', 'info_dict': { 'id': 'VPWON_1220719', 'ext': 'm4v', 'title': 'Nieuwsuur', 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', 'upload_date': '20140622', }, }, { 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', 'info_dict': { 'id': 'VARA_101191800', 'ext': 'm4v', 'title': 'De Mega Mike & Mega Thomas show: The best of.', 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', 'upload_date': '20090227', 'duration': 2400, }, }, { 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', 'info_dict': { 'id': 'VPWON_1169289', 'ext': 'm4v', 'title': 'Tegenlicht: De toekomst komt uit Afrika', 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', 'duration': 3000, }, }, { 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', 'info_dict': { 'id': 'WO_VPRO_043706', 'ext': 'wmv', 'title': 'De nieuwe mens - Deel 1', 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', 'duration': 4680, }, 'params': { # mplayer mms download 'skip_download': True, } }, # non asf in streams { 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', 'md5': 'b3da13de374cbe2d5332a7e910bef97f', 'info_dict': { 'id': 'WO_NOS_762771', 'ext': 'mp4', 'title': 'Hoe gaat Europa verder na Parijs?', }, }, { 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', 'md5': '01c6a2841675995da1f0cf776f03a9c3', 'info_dict': { 'id': 'VPWON_1233944', 'ext': 'm4v', 'title': 'Aap, poot, pies', 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', 'upload_date': '20150508', 'duration': 599, }, }, { 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', 'md5': 'd30cd8417b8b9bca1fdff27428860d08', 'info_dict': { 'id': 'POW_00996502', 'ext': 'm4v', 'title': '''"Dit is wel een 'landslide'..."''', 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', 'upload_date': '20150508', 'duration': 462, }, } ] def _real_extract(self, url): video_id = self._match_id(url) return self._get_info(video_id) def _get_info(self, video_id): metadata = self._download_json( 'http://e.omroep.nl/metadata/%s' % video_id, video_id, # We have to remove the javascript callback transform_source=strip_jsonp, ) # For some videos actual video id (prid) is different (e.g. for # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698 # video id is POMS_WNL_853698 but prid is POW_00996502) video_id = metadata.get('prid') or video_id # titel is too generic in some cases so utilize aflevering_titel as well # when available (e.g. 
http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html) title = metadata['titel'] sub_title = metadata.get('aflevering_titel') if sub_title and sub_title != title: title += ': %s' % sub_title token = self._get_token(video_id) formats = [] pubopties = metadata.get('pubopties') if pubopties: quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) for format_id in pubopties: format_info = self._download_json( 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s' % (video_id, format_id, token), video_id, 'Downloading %s JSON' % format_id) if format_info.get('error_code', 0) or format_info.get('errorcode', 0): continue streams = format_info.get('streams') if streams: video_info = self._download_json( streams[0] + '&type=json', video_id, 'Downloading %s stream JSON' % format_id) else: video_info = format_info video_url = video_info.get('url') if not video_url: continue if format_id == 'adaptive': formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4')) else: formats.append({ 'url': video_url, 'format_id': format_id, 'quality': quality(format_id), }) streams = metadata.get('streams') if streams: for i, stream in enumerate(streams): stream_url = stream.get('url') if not stream_url: continue if '.asf' not in stream_url: formats.append({ 'url': stream_url, 'quality': stream.get('kwaliteit'), }) continue asx = self._download_xml( stream_url, video_id, 'Downloading stream %d ASX playlist' % i, transform_source=fix_xml_ampersands) ref = asx.find('./ENTRY/Ref') if ref is None: continue video_url = ref.get('href') if not video_url: continue formats.append({ 'url': video_url, 'ext': stream.get('formaat', 'asf'), 'quality': stream.get('kwaliteit'), }) self._sort_formats(formats) subtitles = {} if metadata.get('tt888') == 'ja': subtitles['nl'] = [{ 'ext': 'vtt', 'url': 'http://e.omroep.nl/tt888/%s' % video_id, }] return { 'id': video_id, 'title': title, 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), 'duration': parse_duration(metadata.get('tijdsduur')), 'formats': formats, 'subtitles': subtitles, } class NPOLiveIE(NPOBaseIE): IE_NAME = 'npo.nl:live' _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>.+)' _TEST = { 'url': 'http://www.npo.nl/live/npo-1', 'info_dict': { 'id': 'LI_NEDERLAND1_136692', 'display_id': 'npo-1', 'ext': 'mp4', 'title': 're:^Nederland 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'Livestream', 'is_live': True, }, 'params': { 'skip_download': True, } } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) live_id = self._search_regex( r'data-prid="([^"]+)"', webpage, 'live id') metadata = self._download_json( 'http://e.omroep.nl/metadata/%s' % live_id, display_id, transform_source=strip_jsonp) token = self._get_token(display_id) formats = [] streams = metadata.get('streams') if streams: for stream in streams: stream_type = stream.get('type').lower() # smooth streaming is not supported if stream_type in ['ss', 'ms']: continue stream_info = self._download_json( 'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp' % (stream.get('url'), token), display_id, 'Downloading %s JSON' % stream_type) if stream_info.get('error_code', 0) or stream_info.get('errorcode', 0): continue stream_url = self._download_json( stream_info['stream'], display_id, 'Downloading %s URL' % stream_type, 'Unable to download %s URL' 
% stream_type, transform_source=strip_jsonp, fatal=False) if not stream_url: continue if stream_type == 'hds': f4m_formats = self._extract_f4m_formats(stream_url, display_id) # f4m downloader downloads only piece of live stream for f4m_format in f4m_formats: f4m_format['preference'] = -1 formats.extend(f4m_formats) elif stream_type == 'hls': formats.extend(self._extract_m3u8_formats(stream_url, display_id, 'mp4')) else: formats.append({ 'url': stream_url, 'preference': -10, }) self._sort_formats(formats) return { 'id': live_id, 'display_id': display_id, 'title': self._live_title(metadata['titel']), 'description': metadata['info'], 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'formats': formats, 'is_live': True, } class NPORadioIE(InfoExtractor): IE_NAME = 'npo.nl:radio' _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)/?$' _TEST = { 'url': 'http://www.npo.nl/radio/radio-1', 'info_dict': { 'id': 'radio-1', 'ext': 'mp3', 'title': 're:^NPO Radio 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { 'skip_download': True, } } @staticmethod def _html_get_attribute_regex(attribute): return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_regex( self._html_get_attribute_regex('data-channel'), webpage, 'title') stream = self._parse_json( self._html_search_regex(self._html_get_attribute_regex('data-streams'), webpage, 'data-streams'), video_id) codec = stream.get('codec') return { 'id': video_id, 'url': stream['url'], 'title': self._live_title(title), 'acodec': codec, 'ext': codec, 'is_live': True, } class NPORadioFragmentIE(InfoExtractor): IE_NAME = 'npo.nl:radio:fragment' _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/[^/]+/fragment/(?P<id>\d+)' _TEST = { 'url': 'http://www.npo.nl/radio/radio-5/fragment/174356', 'md5': 'dd8cc470dad764d0fdc70a9a1e2d18c2', 'info_dict': { 'id': '174356', 'ext': 'mp3', 'title': 'Jubileumconcert Willeke Alberti', }, } def _real_extract(self, url): audio_id = self._match_id(url) webpage = self._download_webpage(url, audio_id) title = self._html_search_regex( r'href="/radio/[^/]+/fragment/%s" title="([^"]+)"' % audio_id, webpage, 'title') audio_url = self._search_regex( r"data-streams='([^']+)'", webpage, 'audio url') return { 'id': audio_id, 'url': audio_url, 'title': title, } class SchoolTVIE(InfoExtractor): IE_NAME = 'schooltv' _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/video/(?P<id>[^/?#&]+)' _TEST = { 'url': 'http://www.schooltv.nl/video/ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam/', 'info_dict': { 'id': 'WO_NTR_429477', 'display_id': 'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam', 'title': 'Ademhaling: De hele dag haal je adem. 
Maar wat gebeurt er dan eigenlijk in je lichaam?', 'ext': 'mp4', 'description': 'md5:abfa0ff690adb73fd0297fd033aaa631' }, 'params': { # Skip because of m3u8 download 'skip_download': True } } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex( r'data-mid=(["\'])(?P<id>.+?)\1', webpage, 'video_id', group='id') return { '_type': 'url_transparent', 'ie_key': 'NPO', 'url': 'npo:%s' % video_id, 'display_id': display_id } class VPROIE(NPOIE): IE_NAME = 'vpro' _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html' _TESTS = [ { 'url': 'http://tegenlicht.vpro.nl/afleveringen/2012-2013/de-toekomst-komt-uit-afrika.html', 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', 'info_dict': { 'id': 'VPWON_1169289', 'ext': 'm4v', 'title': 'De toekomst komt uit Afrika', 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, }, { 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', 'info_dict': { 'id': 'sergio-herman', 'title': 'Sergio Herman: Fucking perfect', }, 'playlist_count': 2, }, { # playlist with youtube embed 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html', 'info_dict': { 'id': 'education-education', 'title': '2Doc', }, 'playlist_count': 2, } ] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) entries = [ self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) for video_id in re.findall(r'data-media-id="([^"]+)"', webpage) ] playlist_title = self._search_regex( r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>', webpage, 'playlist title', default=None) or self._og_search_title(webpage) return self.playlist_result(entries, playlist_id, playlist_title) class WNLIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+' _TEST = { 'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515', 'info_dict': { 'id': 'vandaag-de-dag-6-mei', 'title': 'Vandaag de Dag 6 mei', }, 'playlist_count': 4, } def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) entries = [ self.url_result('npo:%s' % video_id, 'NPO') for video_id, part in re.findall( r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage) ] playlist_title = self._html_search_regex( r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>', webpage, 'playlist title') return self.playlist_result(entries, playlist_id, playlist_title)
youtube-dl/youtube_dl/extractor/folketinget.py
# -*- coding: utf-8 -*- from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_parse_qs from ..utils import ( int_or_none, parse_duration, parse_iso8601, xpath_text, ) class FolketingetIE(InfoExtractor): IE_DESC = 'Folketinget (ft.dk; Danish parliament)' _VALID_URL = r'https?://(?:www\.)?ft\.dk/webtv/video/[^?#]*?\.(?P<id>[0-9]+)\.aspx' _TEST = { 'url': 'http://www.ft.dk/webtv/video/20141/eru/td.1165642.aspx?as=1#player', 'md5': '6269e8626fa1a891bf5369b386ae996a', 'info_dict': { 'id': '1165642', 'ext': 'mp4', 'title': 'Åbent samråd i Erhvervsudvalget', 'description': 'Åbent samråd med erhvervs- og vækstministeren om regeringens politik på teleområdet', 'view_count': int, 'width': 768, 'height': 432, 'tbr': 928000, 'timestamp': 1416493800, 'upload_date': '20141120', 'duration': 3960, },
'params': { # rtmp download 'skip_download': True, }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) description = self._html_search_regex( r'(?s)
<div[^>]*>(.*?)<', webpage, 'description', fatal=False) player_params = compat_parse_qs(self._search_regex( r'
youtube-dl/youtube_dl/extractor/audimedia.py
# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, sanitized_Request, ) class AudiMediaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P<id>[^/?#]+)' _TEST = { 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { 'id': '1565', 'ext': 'mp4', 'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test', 'description': 'md5:60e5d30a78ced725f7b8d34370762941', 'upload_date': '20151124', 'timestamp': 1448354940, 'duration': 74022, 'view_count': int, } } # extracted from https://audimedia.tv/assets/embed/embedded-player.js (dataSourceAuthToken) _AUTH_TOKEN = 'e25b42847dba18c6c8816d5d8ce94c326e06823ebf0859ed164b3ba169be97f2' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) raw_payload = self._search_regex(r'<div[^>]+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload') _, stage_mode, video_id, lang = raw_payload.split('-') # TODO: handle s and e stage_mode (live streams and ended live streams) if stage_mode not in ('s', 'e'): request = sanitized_Request( 'https://audimedia.tv/api/video/v1/videos/%s?embed[]=video_versions&embed[]=thumbnail_image&where[content_language_iso]=%s' % (video_id, lang), headers={'X-Auth-Token': self._AUTH_TOKEN}) json_data = self._download_json(request, video_id)['results'] formats = [] stream_url_hls = json_data.get('stream_url_hls') if stream_url_hls: formats.extend(self._extract_m3u8_formats( stream_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) stream_url_hds = json_data.get('stream_url_hds') if stream_url_hds: formats.extend(self._extract_f4m_formats( stream_url_hds + '?hdcore=3.4.0', video_id, f4m_id='hds', fatal=False)) for video_version in json_data.get('video_versions'): video_version_url = video_version.get('download_url') or video_version.get('stream_url') if not video_version_url: continue formats.append({ 'url': video_version_url, 'width': int_or_none(video_version.get('width')), 'height': int_or_none(video_version.get('height')), 'abr': int_or_none(video_version.get('audio_bitrate')), 'vbr': int_or_none(video_version.get('video_bitrate')), }) self._sort_formats(formats) return { 'id': video_id, 'title': json_data['title'], 'description': json_data.get('subtitle'), 'thumbnail': json_data.get('thumbnail_image', {}).get('file'), 'timestamp': parse_iso8601(json_data.get('publication_date')), 'duration': int_or_none(json_data.get('duration')), 'view_count': int_or_none(json_data.get('view_count')), 'formats': formats, }
youtube-dl/youtube_dl/extractor/webofstories.py
# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import int_or_none class WebOfStoriesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)' _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/' _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/' _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/' _TESTS = [ { 'url': 'http://www.webofstories.com/play/hans.bethe/71', 'md5': '373e4dd915f60cfe3116322642ddf364', 'info_dict': { 'id': '4536', 'ext': 'mp4', 'title': 'The temperature of the sun', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'Hans Bethe talks about calculating the temperature of the sun', 'duration': 238, } }, { 'url': 'http://www.webofstories.com/play/55908', 'md5': '2985a698e1fe3211022422c4b5ed962c', 'info_dict': {
'id': '55908', 'ext': 'mp4', 'title': 'The story of Gemmata obscuriglobus', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', 'duration': 169, } }, ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) description = self._html_search_meta('description', webpage) thumbnail = self._og_search_thumbnail(webpage) embed_params = [s.strip(" \r\n\t'") for s in self._search_regex( r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)', webpage, 'embed params').split(',')] ( _, speaker_id, story_id, story_duration, speaker_type, great_life, _thumbnail, _has_subtitles, story_filename, _story_order) = embed_params is_great_life_series = great_life == 'true' duration = int_or_none(story_duration) # URL building, see: http://www.webofstories.com/scripts/player.js ms_prefix = '' if speaker_type.lower() == 'ms': ms_prefix = 'mini_sites/' if is_great_life_series: mp4_url = '{0:}lives/{1:}/{2:}.mp4'.format( self._VIDEO_DOMAIN, speaker_id, story_filename) rtmp_ext = 'flv' streamer = self._GREAT_LIFE_STREAMER play_path = 'stories/{0:}/{1:}'.format( speaker_id, story_filename) else: mp4_url = '{0:}{1:}{2:}/{3:}.mp4'.format( self._VIDEO_DOMAIN, ms_prefix, speaker_id, story_filename) rtmp_ext = 'mp4' streamer = self._USER_STREAMER play_path = 'mp4:{0:}{1:}/{2}.mp4'.format( ms_prefix, speaker_id, story_filename) formats = [{ 'format_id': 'mp4_sd', 'url': mp4_url, }, { 'format_id': 'rtmp_sd', 'page_url': url, 'url': streamer, 'ext': rtmp_ext, 'play_path': play_path, }] self._sort_formats(formats) return { 'id': story_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'description': description, 'duration': duration, } class WebOfStoriesPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.webofstories.com/playAll/donald.knuth', 'info_dict': { 'id': 'donald.knuth', 'title': 'Donald Knuth (Scientist)', }, 'playlist_mincount': 97, } def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) entries = [ self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories') for video_number in set(re.findall('href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage)) ] title = self._search_regex( r'
<div id="speakerName">\s*<span>([^<]+)</span>', webpage, 'speaker', default=None) if title: field = self._search_regex( r'<span id="primaryField">([^<]+)</span>', webpage, 'field', default=None) if field: title += ' (%s)' % field if not title: title = self._search_regex( r'Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories', webpage, 'title') return self.playlist_result(entries, playlist_id, title)
youtube-dl/youtube_dl/extractor/elpais.py
# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import unified_strdate class ElPaisIE(InfoExtractor): _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])' IE_DESC = 'El País' _TEST = { 'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html', 'md5': '98406f301f19562170ec071b83433d55', 'info_dict': { 'id': 'tiempo-nuevo-recetas-viejas', 'ext': 'mp4', 'title': 'Tiempo nuevo, recetas viejas', 'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.', 'upload_date': '20140206', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) prefix = self._html_search_regex( r'var url_cache = "([^"]+)";', webpage, 'URL prefix') video_suffix = self._search_regex( r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL') video_url = prefix + video_suffix thumbnail_suffix = self._search_regex( r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL', fatal=False) thumbnail = ( None if thumbnail_suffix is None else prefix + thumbnail_suffix)
title = self._html_search_regex( '<h2 class="entry-header entry-title.*?>(.*?)</h2>', webpage, 'title') date_str = self._search_regex( r'<p class="date-header date-int updated"\s+title="([^"]+)">', webpage, 'upload date', fatal=False)
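# unified_strdate (from ..utils, imported above) normalizes the captured
# date string to YYYYMMDD; a minimal sketch with an assumed input value:
#
#   unified_strdate('February 6, 2014') == '20140206'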
upload_date = (None if date_str is None else unified_strdate(date_str)) return { 'id': video_id, 'url': video_url, 'title': title, 'description': self._og_search_description(webpage), 'thumbnail': thumbnail, 'upload_date': upload_date, }
youtube-dl/youtube_dl/extractor/safari.py
# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE from ..utils import ( ExtractorError, sanitized_Request, smuggle_url, std_headers, urlencode_postdata, ) class SafariBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>' _NETRC_MACHINE = 'safari' _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' _API_FORMAT = 'json' LOGGED_IN = False def _real_initialize(self): # We only need to log in once for courses or individual videos if not self.LOGGED_IN: self._login() SafariBaseIE.LOGGED_IN = True def _login(self): (username, password) = self._get_login_info() if username is None: self.raise_login_required('safaribooksonline.com account is required') headers = std_headers if 'Referer' not in headers: headers['Referer'] = self._LOGIN_URL login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login form') csrf = self._html_search_regex( r"name='csrfmiddlewaretoken'\s+value='([^']+)'", login_page, 'csrf token') login_form = { 'csrfmiddlewaretoken': csrf, 'email': username, 'password1': password, 'login': 'Sign In', 'next': '', } request = sanitized_Request( self._LOGIN_URL, urlencode_postdata(login_form), headers=headers) login_page = self._download_webpage( request, None, 'Logging in as %s' % username) if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: raise ExtractorError( 'Login failed; make sure your credentials are correct and try again.', expected=True) self.to_screen('Login successful') class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' _VALID_URL = r'''(?x)https?:// (?:www\.)?safaribooksonline\.com/ (?: library/view/[^/]+| api/v1/book )/ (?P<course_id>[^/]+)/ (?:chapter(?:-content)?/)?
(?P<part>part\d+)\.html ''' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', 'md5': '5b0c4cc1b3c1ba15dda7344085aa5592', 'info_dict': { 'id': '2842601850001', 'ext': 'mp4', 'title': 'Introduction', }, 'skip': 'Requires safaribooksonline account credentials', }, { 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', 'only_matching': True, }, { # non-digits in course id 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) course_id = mobj.group('course_id') part = mobj.group('part') webpage = self._download_webpage( '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), part) bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) if not bc_url: raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy') class SafariCourseIE(SafariBaseIE): IE_NAME = 'safari:course' IE_DESC = 'safaribooksonline.com online courses' _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<course_id>[^/]+)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', 'info_dict': { 'id': '9780133392838', 'title': 'Hadoop Fundamentals LiveLessons', }, 'playlist_count': 22, 'skip': 'Requires safaribooksonline account credentials', }, { 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', 'only_matching': True, }] def _real_extract(self, url): course_id = self._match_id(url) course_json = self._download_json( '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), course_id, 'Downloading course JSON') if 'chapters' not in course_json: raise ExtractorError( 'No chapters found for course %s' % course_id, expected=True) entries = [ self.url_result(chapter, 'Safari') for chapter in course_json['chapters']] course_title = course_json['title'] return self.playlist_result(entries, course_id, course_title) youtube-dl/youtube_dl/extractor/vice.py0000644000000000000000000000255712641030331017335 0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor from .ooyala import OoyalaIE from ..utils import ExtractorError class ViceIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)' _TESTS = [ { 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', 'info_dict': { 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', 'ext': 'mp4', 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', 'duration': 725.983, }, 'params': { # Requires ffmpeg (m3u8 manifest) 'skip_download': True, }, }, { 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', 'only_matching': True, } ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) try: embed_code = self._search_regex( r'embedCode=([^&\'"]+)', webpage, 'ooyala embed code') ooyala_url = OoyalaIE._url_for_embed_code(embed_code) except ExtractorError: raise ExtractorError('The page doesn\'t contain a video', expected=True) return self.url_result(ooyala_url, ie='Ooyala') youtube-dl/youtube_dl/extractor/dotsub.py0000644000000000000000000000361112641030331017677 0ustar rootrootfrom __future__ import unicode_literals from .common 
import InfoExtractor from ..utils import ( float_or_none, int_or_none, ) class DotsubIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)' _TEST = { 'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27', 'md5': '0914d4d69605090f623b7ac329fea66e', 'info_dict': { 'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27', 'ext': 'flv', 'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary', 'description': 'md5:699a0f7f50aeec6042cb3b1db2d0d074', 'thumbnail': 're:^https?://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p', 'duration': 3169, 'uploader': '4v4l0n42', 'timestamp': 1292248482.625, 'upload_date': '20101213', 'view_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) info = self._download_json( 'https://dotsub.com/api/media/%s/metadata' % video_id, video_id) video_url = info.get('mediaURI') if not video_url: webpage = self._download_webpage(url, video_id) video_url = self._search_regex( [r'<source[^>]+src="([^"]+)"', r'"file"\s*:\s*\'([^\']+)'], webpage, 'video url') return { 'id': video_id, 'url': video_url, 'ext': 'flv', 'title': info['title'], 'description': info.get('description'), 'thumbnail': info.get('screenshotURI'), 'duration': int_or_none(info.get('duration'), 1000), 'uploader': info.get('user'), 'timestamp': float_or_none(info.get('dateCreated'), 1000), 'view_count': int_or_none(info.get('numberOfViews')), } youtube-dl/youtube_dl/extractor/eroprofile.py0000644000000000000000000000620612641030331020550 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, unescapeHTML ) class EroProfileIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)' _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?' 
_NETRC_MACHINE = 'eroprofile' _TESTS = [{ 'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore', 'md5': 'c26f351332edf23e1ea28ce9ec9de32f', 'info_dict': { 'id': '3733775', 'display_id': 'sexy-babe-softcore', 'ext': 'm4v', 'title': 'sexy babe softcore', 'thumbnail': 're:https?://.*\.jpg', 'age_limit': 18, } }, { 'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file', 'md5': '1baa9602ede46ce904c431f5418d8916', 'info_dict': { 'id': '1133519', 'ext': 'm4v', 'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file', 'thumbnail': 're:https?://.*\.jpg', 'age_limit': 18, }, 'skip': 'Requires login', }] def _login(self): (username, password) = self._get_login_info() if username is None: return query = compat_urllib_parse.urlencode({ 'username': username, 'password': password, 'url': 'http://www.eroprofile.com/', }) login_url = self._LOGIN_URL + query login_page = self._download_webpage(login_url, None, False) m = re.search(r'Your username or password was incorrect\.', login_page) if m: raise ExtractorError( 'Wrong username and/or password.', expected=True) self.report_login() redirect_url = self._search_regex( r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url') self._download_webpage(redirect_url, None, False) def _real_initialize(self): self._login() def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) m = re.search(r'You must be logged in to view this video\.', webpage) if m: self.raise_login_required('This video requires login') video_id = self._search_regex( [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], webpage, 'video id', default=None) video_url = unescapeHTML(self._search_regex( r'<source src="([^"]+)', webpage, 'video url')) title = self._html_search_regex( r'Title:</th><td>([^<]+)</td>', webpage, 'title') thumbnail = self._search_regex( r'onclick="showVideoPlayer\(\)"><img src="([^"]+)', webpage, 'thumbnail', fatal=False) return { 'id': video_id, 'display_id': display_id, 'url': video_url, 'title': title, 'thumbnail': thumbnail, 'age_limit': 18, } youtube-dl/youtube_dl/extractor/ivi.py # encoding: utf-8 from __future__ import unicode_literals import re import json from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, sanitized_Request, ) class IviIE(InfoExtractor): IE_DESC = 'ivi.ru' IE_NAME = 'ivi' _VALID_URL = r'https?://(?:www\.)?ivi\.ru/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' _TESTS = [ # Single movie { 'url': 'http://www.ivi.ru/watch/53141', 'md5': '6ff5be2254e796ed346251d117196cf4', 'info_dict': { 'id': '53141', 'ext': 'mp4', 'title': 'Иван Васильевич меняет профессию', 'description': 'md5:b924063ea1677c8fe343d8a72ac2195f', 'duration': 5498, 'thumbnail': 're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', }, # Serial's series { 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549', 'md5': '221f56b35e3ed815fde2df71032f4b3e', 'info_dict': { 'id': '9549', 'ext': 'mp4', 'title': 'Двое из ларца - Дело Гольдберга (1 часть)', 'series': 'Двое из ларца', 'season': 'Сезон 1', 'season_number': 1, 'episode': 'Дело Гольдберга (1 часть)', 'episode_number': 1, 'duration': 2655, 'thumbnail': 're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', } ] # Sorted by quality _KNOWN_FORMATS = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] def _real_extract(self, url): video_id = self._match_id(url) data = { 'method': 'da.content.get', 'params': [ video_id, { 'site': 's183', 'referrer': 'http://www.ivi.ru/watch/%s' % video_id, 'contentid': video_id } ] } request = sanitized_Request( 'http://api.digitalaccess.ru/api/json/', json.dumps(data)) video_json = self._download_json( request, video_id, 'Downloading video JSON') if 'error' in video_json: error = video_json['error'] if error['origin'] == 'NoRedisValidData': raise ExtractorError('Video %s does not exist' % video_id, expected=True) raise ExtractorError( 'Unable to download video %s: %s' % (video_id, error['message']), expected=True) result = video_json['result'] formats = [{ 'url': x['url'], 'format_id': x['content_format'], 
'preference': self._KNOWN_FORMATS.index(x['content_format']), } for x in result['files'] if x['content_format'] in self._KNOWN_FORMATS] self._sort_formats(formats) title = result['title'] duration = int_or_none(result.get('duration')) compilation = result.get('compilation') episode = title if compilation else None title = '%s - %s' % (compilation, title) if compilation is not None else title thumbnails = [{ 'url': preview['url'], 'id': preview.get('content_format'), } for preview in result.get('preview', []) if preview.get('url')] webpage = self._download_webpage(url, video_id) season = self._search_regex( r'<li[^>]+class="season active"[^>]*><a[^>]+>([^<]+)', webpage, 'season', default=None) season_number = int_or_none(self._search_regex( r'<li[^>]+class="season active"[^>]*><a[^>]+data-season(?:-index)?="(\d+)"', webpage, 'season number', default=None)) episode_number = int_or_none(self._search_regex( r'<meta[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)', webpage, 'episode number', default=None)) description = self._og_search_description(webpage, default=None) or self._html_search_meta( 'description', webpage, 'description', default=None) return { 'id': video_id, 'title': title, 'series': compilation, 'season': season, 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, 'thumbnails': thumbnails, 'description': description, 'duration': duration, 'formats': formats, } class IviCompilationIE(InfoExtractor): IE_DESC = 'ivi.ru compilations' IE_NAME = 'ivi:compilation' _VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$' _TESTS = [{ 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa', 'info_dict': { 'id': 'dvoe_iz_lartsa', 'title': 'Двое из ларца (2006 - 2008)', }, 'playlist_mincount': 24, }, { 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/season1', 'info_dict': { 'id': 'dvoe_iz_lartsa/season1', 'title': 'Двое из ларца (2006 - 2008) 1 сезон', }, 'playlist_mincount': 12, }] def _extract_entries(self, html, compilation_id): return [ self.url_result( 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key()) for serie in re.findall( r'<a href="/watch/%s/(\d+)"[^>]+data-id="\1"' % compilation_id, html)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) compilation_id = mobj.group('compilationid') season_id = mobj.group('seasonid') if season_id is not None: # Season link season_page = self._download_webpage( url, compilation_id, 'Downloading season %s web page' % season_id) playlist_id = '%s/season%s' % (compilation_id, season_id) playlist_title = self._html_search_meta('title', season_page, 'title') entries = self._extract_entries(season_page, compilation_id) else: # Compilation link compilation_page = self._download_webpage(url, compilation_id, 'Downloading compilation web page') playlist_id = compilation_id playlist_title = self._html_search_meta('title', compilation_page, 'title') seasons = re.findall( r'<a href="/watch/%s/season(\d+)' % compilation_id, compilation_page) if not seasons: # No seasons in this compilation entries = self._extract_entries(compilation_page, compilation_id) else: entries = [] for season_id in seasons: season_page = self._download_webpage( 'http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), compilation_id, 'Downloading season %s web page' % season_id) entries.extend(self._extract_entries(season_page, compilation_id)) return self.playlist_result(entries, playlist_id, playlist_title) youtube-dl/youtube_dl/extractor/wdr.py m3u8_url = self._search_regex( r'\s*<ul[^>]*>\s*<li>\s*<a[^>]+href="([^"]+)"', webpage, 'm3u8 url', default=None) if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, page_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) direct_urls = re.findall( r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage) if direct_urls: for quality, video_url in direct_urls: formats.append({ 'url': video_url, 'preference': preference(quality), 'http_headers': { 'User-Agent': 'mobile', }, }) self._sort_formats(formats) description = self._html_search_meta('Description', webpage, 'description') return { 'id': page_id, 'formats': formats, 'title': title, 'description': 
description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'is_live': is_live } class WDRMobileIE(InfoExtractor): _VALID_URL = r'''(?x) https?://mobile-ondemand\.wdr\.de/ .*?/fsk(?P<age_limit>[0-9]+) /[0-9]+/[0-9]+/ (?P<id>[0-9]+)_(?P<title>[0-9]+)''' IE_NAME = 'wdr:mobile' _TEST = { 'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4', 'info_dict': { 'title': '4283021', 'id': '421735', 'ext': 'mp4', 'age_limit': 0, }, 'skip': 'Problems with loading data.' } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) return { 'id': mobj.group('id'), 'title': mobj.group('title'), 'age_limit': int(mobj.group('age_limit')), 'url': url, 'http_headers': { 'User-Agent': 'mobile', }, } class WDRMausIE(InfoExtractor): _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' IE_DESC = 'Sendung mit der Maus' _TESTS = [{ 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { 'id': 'aktuelle-sendung', 'ext': 'mp4', 'thumbnail': 're:^http://.+\.jpg', 'upload_date': 're:^[0-9]{8}$', 'title': 're:^[0-9.]{10} - Aktuelle Sendung$', } }, { 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5', 'md5': '3b1227ca3ed28d73ec5737c65743b2a3', 'info_dict': { 'id': '40_jahre_maus', 'ext': 'mp4', 'thumbnail': 're:^http://.+\.jpg', 'upload_date': '20131007', 'title': '12.03.2011 - 40 Jahre Maus', } }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) param_code = self._html_search_regex( r'<a href="\?startVideo=1&([^"]+)"', webpage, 'parameters') title_date = self._search_regex( r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>', webpage, 'air date') title_str = self._html_search_regex( r'<h1>(.*?)</h1>', webpage, 'title') title = '%s - %s' % (title_date, title_str) upload_date = unified_strdate( self._html_search_meta('dc.date', webpage)) fields = compat_parse_qs(param_code) video_url = fields['firstVideo'][0] thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0]) formats = [{ 'format_id': 'rtmp', 'url': video_url, }] jscode = self._download_webpage( 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js', video_id, fatal=False, note='Downloading URL translation table', errnote='Could not download URL translation table') if jscode: for m in re.finditer( r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}", jscode): if video_url.startswith(m.group('stream')): http_url = video_url.replace( m.group('stream'), m.group('dl')) formats.append({ 'format_id': 'http', 'url': http_url, }) break self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'upload_date': upload_date, } youtube-dl/youtube_dl/extractor/canvas.py0000644000000000000000000000701512662061715017671 0ustar rootroot
from __future__ import unicode_literals from .common import InfoExtractor from ..utils import float_or_none class CanvasIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?canvas\.be/video/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', 'md5': 'ea838375a547ac787d4064d8c7860a6c', 'info_dict': { 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', 'ext': 'mp4', 'title': 'De afspraak veilt voor de Warmste Week', 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 49.02, } }, { # with subtitles 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', 'info_dict': { 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', 'display_id': 'pieter-0167', 'ext': 'mp4', 'title': 'Pieter 0167', 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 2553.08, 'subtitles': { 'nl': [{ 'ext': 'vtt', }], }, }, 'params': { 'skip_download': True, } }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) title = self._search_regex( r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', webpage, 'title', default=None) or self._og_search_title(webpage) video_id = self._html_search_regex( r'data-video=(["\'])(?P<id>.+?)\1', webpage, 'video id', group='id') data = self._download_json( 'https://mediazone.vrt.be/api/v1/canvas/assets/%s' % video_id, display_id) formats = [] for target in data['targetUrls']: format_url, format_type = target.get('url'), target.get('type') if not format_url or not format_type: continue if format_type == 'HLS': formats.extend(self._extract_m3u8_formats( format_url, display_id, entry_protocol='m3u8_native', ext='mp4', preference=0, fatal=False, m3u8_id=format_type)) elif format_type == 'HDS': formats.extend(self._extract_f4m_formats( format_url, display_id, f4m_id=format_type, fatal=False)) else: formats.append({ 'format_id': format_type, 'url': format_url, }) self._sort_formats(formats) subtitles = {} subtitle_urls = data.get('subtitleUrls') if isinstance(subtitle_urls, list): for subtitle in subtitle_urls: subtitle_url = subtitle.get('url') if subtitle_url and subtitle.get('type') == 'CLOSED': subtitles.setdefault('nl', []).append({'url': subtitle_url}) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': self._og_search_description(webpage), 'formats': formats, 'duration': float_or_none(data.get('duration'), 1000), 'thumbnail': data.get('posterImageUrl'), 'subtitles': subtitles, } 
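# CanvasIE above dispatches on each target's 'type': HLS entries go through the
# m3u8 helper, HDS through the f4m helper, and anything else is kept as a plain
# format dict. A stripped-down, offline sketch of that dispatch; the function
# name and sample data below are made up for illustration:
def sort_canvas_targets(target_urls):
    # Returns (plain_formats, hls_urls, hds_urls), mirroring the branches
    # taken in CanvasIE._real_extract.
    plain, hls, hds = [], [], []
    for target in target_urls:
        format_url, format_type = target.get('url'), target.get('type')
        if not format_url or not format_type:
            continue  # incomplete entries are skipped, as in the extractor
        if format_type == 'HLS':
            hls.append(format_url)
        elif format_type == 'HDS':
            hds.append(format_url)
        else:
            plain.append({'format_id': format_type, 'url': format_url})
    return plain, hls, hds


print(sort_canvas_targets([
    {'url': 'http://example.invalid/master.m3u8', 'type': 'HLS'},
    {'url': 'http://example.invalid/video.mp4', 'type': 'PROGRESSIVE_DOWNLOAD'},
]))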
youtube-dl/youtube_dl/extractor/beeg.py0000644000000000000000000000631412645732000017312 0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, compat_urllib_parse_unquote, ) from ..utils import ( int_or_none, parse_iso8601, ) class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)' _TEST = { 'url': 'http://beeg.com/5416503', 'md5': '46c384def73b33dbc581262e5ee67cef', 'info_dict': { 'id': '5416503', 'ext': 'mp4', 'title': 'Sultry Striptease', 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', 'timestamp': 1391813355, 'upload_date': '20140207', 'duration': 383, 'tags': list, 'age_limit': 18, } } def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( 'https://api.beeg.com/api/v5/video/%s' % video_id, video_id) def split(o, e): def cut(s, x): n.append(s[:x]) return s[x:] n = [] r = len(o) % e if r > 0: o = cut(o, r) while len(o) > e: o = cut(o, e) n.append(o) return n def decrypt_key(key): # Reverse engineered from http://static.beeg.com/cpl/1105.js a = '5ShMcIQlssOd7zChAIOlmeTZDaUxULbJRnywYaiB' e = compat_urllib_parse_unquote(key) o = ''.join([ compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21) for n in range(len(e))]) return ''.join(split(o, 3)[::-1]) def decrypt_url(encrypted_url): encrypted_url = self._proto_relative_url( encrypted_url.replace('{DATA_MARKERS}', ''), 'https:') key = self._search_regex( r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None) if not key: return encrypted_url return encrypted_url.replace(key, decrypt_key(key)) formats = [] for format_id, video_url in video.items(): if not video_url: continue height = self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None) if not height: continue formats.append({ 'url': decrypt_url(video_url), 'format_id': format_id, 'height': int(height), }) self._sort_formats(formats) title = video['title'] video_id = video.get('id') or video_id display_id = video.get('code') description = video.get('desc') timestamp = parse_iso8601(video.get('date'), ' ') duration = int_or_none(video.get('duration')) tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'timestamp': timestamp, 'duration': duration, 'tags': tags, 'formats': formats, 'age_limit': 18, } 
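# The `split`/`decrypt_key` closures above implement the key scrambling that
# static.beeg.com/cpl/1105.js applies to format URLs: each percent-decoded
# character is shifted back by (alphabet char code % 21), then the string is
# reassembled from reversed 3-character chunks. A standalone sketch of the same
# transform, with Python 3 stdlib standing in for the compat helpers:
from urllib.parse import unquote


def split_chunks(s, size):
    # First chunk takes len(s) % size characters (if any); the rest take `size`.
    head = len(s) % size
    chunks = [s[:head]] if head else []
    chunks.extend(s[i:i + size] for i in range(head, len(s), size))
    return chunks


def decrypt_beeg_key(key, alphabet='5ShMcIQlssOd7zChAIOlmeTZDaUxULbJRnywYaiB'):
    decoded = unquote(key)
    shifted = ''.join(
        chr(ord(c) - ord(alphabet[i % len(alphabet)]) % 21)  # % binds before -
        for i, c in enumerate(decoded))
    return ''.join(split_chunks(shifted, 3)[::-1])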
youtube-dl/youtube_dl/extractor/telebruxelles.py0000644000000000000000000000446012641030331021261 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class TeleBruxellesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?telebruxelles\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)' _TESTS = [{ 'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/', 'md5': '59439e568c9ee42fb77588b2096b214f', 'info_dict': { 'id': '11942', 'display_id': 'auditions-devant-parlement-francken-galant-tres-attendus', 'ext': 'flv', 'title': 'Parlement : Francken et Galant répondent aux interpellations de l’opposition', 'description': 're:Les auditions des ministres se poursuivent*' }, 'params': { 'skip_download': 'requires rtmpdump' }, }, { 'url': 'http://www.telebruxelles.be/sport/basket-brussels-bat-mons-80-74/', 'md5': '181d3fbdcf20b909309e5aef5c6c6047', 'info_dict': { 'id': '10091', 'display_id': 'basket-brussels-bat-mons-80-74', 'ext': 'flv', 'title': 'Basket : le Brussels bat Mons 80-74', 'description': 're:^Ils l\u2019on fait ! 
En basket, le B*', }, 'params': { 'skip_download': 'requires rtmpdump' }, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) article_id = self._html_search_regex( r"<article id=\"post-(\d+)\"", webpage, 'article ID') title = self._html_search_regex( r'<h1 class=\"entry-title\">(.*?)</h1>', webpage, 'title') description = self._og_search_description(webpage) rtmp_url = self._html_search_regex( r"file: \"(rtmp://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/vod/mp4:\" \+ \"\w+\" \+ \".mp4)\"", webpage, 'RTMP url') rtmp_url = rtmp_url.replace("\" + \"", "") return { 'id': article_id, 'display_id': display_id, 'title': title, 'description': description, 'url': rtmp_url, 'ext': 'flv', 'rtmp_live': True # if rtmpdump is not called with "--live" argument, the download is blocked and can be completed } youtube-dl/youtube_dl/extractor/pladform.py0000644000000000000000000000650112641030331020204 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, xpath_text, qualities, ) class PladformIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: (?: out\.pladform\.ru/player| static\.pladform\.ru/player\.swf ) \?.*\bvideoid=| video\.pladform\.ru/catalog/video/videoid/ ) (?P<id>\d+) ''' _TESTS = [{ # http://muz-tv.ru/kinozal/view/7400/ 'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', 'md5': '61f37b575dd27f1bb2e1854777fe31f4', 'info_dict': { 'id': '100183293', 'ext': 'mp4', 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 694, 'age_limit': 0, }, }, { 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', 'only_matching': True, }, { 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', 'only_matching': True, }] @staticmethod def _extract_url(webpage): mobj = re.search( r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) if mobj: return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) video = self._download_xml( 'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, video_id) if video.tag == 'error': raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, video.text), expected=True) quality = qualities(('ld', 'sd', 'hd')) formats = [{ 'url': src.text, 'format_id': src.get('quality'), 'quality': quality(src.get('quality')), } for src in video.findall('./src')] self._sort_formats(formats) webpage = self._download_webpage( 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, video_id) title = self._og_search_title(webpage, fatal=False) or xpath_text( video, './/title', 'title', fatal=True) description = self._search_regex( r'</h3>\s*<p>([^<]+)</p>', 
webpage, 'description', fatal=False) thumbnail = self._og_search_thumbnail(webpage) or xpath_text( video, './/cover', 'cover') duration = int_or_none(xpath_text(video, './/time', 'duration')) age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'age_limit': age_limit, 'formats': formats, } youtube-dl/youtube_dl/extractor/tubitv.py0000644000000000000000000000513212641030331017714 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import codecs import re from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, ) class TubiTvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)' _LOGIN_URL = 'http://tubitv.com/login' _NETRC_MACHINE = 'tubitv' _TEST = { 'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01', 'info_dict': { 'id': '54411', 'ext': 'mp4', 'title': 'The Kitchen Musical - EP01', 'thumbnail': 're:^https?://.*\.png$', 'description': 'md5:37532716166069b353e8866e71fefae7', 'duration': 2407, }, 'params': { 'skip_download': 'HLS download', }, } def _login(self): (username, password) = self._get_login_info() if username is None: return self.report_login() form_data = { 'username': username, 'password': password, } payload = compat_urllib_parse.urlencode(form_data).encode('utf-8') request = sanitized_Request(self._LOGIN_URL, payload) request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_page = self._download_webpage( request, None, False, 'Wrong login info') if not re.search(r'id="tubi-logout"', login_page): raise ExtractorError( 'Login failed (invalid username/password)', expected=True) def _real_initialize(self): self._login() def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage): self.raise_login_required('This video requires login') title = self._og_search_title(webpage) description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) duration = int_or_none(self._html_search_meta( 'video:duration', webpage, 'duration')) apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu') m3u8_url = codecs.decode(apu, 'rot_13')[::-1] formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'description': description, 'duration': duration, } 
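# TubiTvIE above hides the HLS manifest behind the page's `apu` variable: the
# URL is reversed and then rot13-encoded. A small sketch of the round trip,
# using a made-up sample URL (the real value comes from the webpage):
import codecs


def deobfuscate_apu(apu):
    # Same transform as in TubiTvIE._real_extract: undo rot13, then reverse.
    return codecs.decode(apu, 'rot_13')[::-1]


sample_m3u8 = 'http://example.invalid/streams/master.m3u8'  # hypothetical URL
obfuscated = codecs.encode(sample_m3u8, 'rot_13')[::-1]
assert deobfuscate_apu(obfuscated) == sample_m3u8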
youtube-dl/youtube_dl/extractor/dcn.py0000644000000000000000000001662112645665720017164 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re import base64 from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_str, ) from ..utils import ( int_or_none, parse_iso8601, sanitized_Request, smuggle_url, unsmuggle_url, ) class DCNIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' def _real_extract(self, url): show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() if video_id and int(video_id) > 0: return self.url_result( 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo') elif season_id and int(season_id) > 0: return self.url_result(smuggle_url( 'http://www.dcndigital.ae/program/season/%s' % season_id, {'show_id': show_id}), 'DCNSeason') else: return self.url_result( 'http://www.dcndigital.ae/program/%s' % show_id, 'DCNSeason') class DCNBaseIE(InfoExtractor): def _extract_video_info(self, video_data, video_id, is_live): title = video_data.get('title_en') or video_data['title_ar'] img = video_data.get('img') thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None duration = int_or_none(video_data.get('duration')) description = video_data.get('description_en') or video_data.get('description_ar') timestamp = parse_iso8601(video_data.get('create_time'), ' ') return { 'id': video_id, 'title': self._live_title(title) if is_live else title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'timestamp': timestamp, 'is_live': is_live, } def _extract_video_formats(self, webpage, video_id, entry_protocol): formats = [] m3u8_url = self._html_search_regex( r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False) if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None)) rtsp_url = self._search_regex( r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) if rtsp_url: formats.append({ 'url': rtsp_url, 'format_id': 'rtsp', }) self._sort_formats(formats) return formats class DCNVideoIE(DCNBaseIE): IE_NAME = 'dcn:video' _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' _TEST = { 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': { 'id': '17375', 'ext': 'mp4', 'title': 'رحلة العمر : الحلقة 1', 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', 'duration': 2041, 'timestamp': 1227504126, 'upload_date': '20081124', }, 'params': { # m3u8 
download 'skip_download': True, }, } def _real_extract(self, url): video_id = self._match_id(url) request = sanitized_Request( 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, headers={'Origin': 'http://www.dcndigital.ae'}) video_data = self._download_json(request, video_id) info = self._extract_video_info(video_data, video_id, False) webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + compat_urllib_parse.urlencode({ 'id': video_data['id'], 'user_id': video_data['user_id'], 'signature': video_data['signature'], 'countries': 'Q0M=', 'filter': 'DENY', }), video_id) info['formats'] = self._extract_video_formats(webpage, video_id, 'm3u8_native') return info class DCNLiveIE(DCNBaseIE): IE_NAME = 'dcn:live' _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' def _real_extract(self, url): channel_id = self._match_id(url) request = sanitized_Request( 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, headers={'Origin': 'http://www.dcndigital.ae'}) channel_data = self._download_json(request, channel_id) info = self._extract_video_info(channel_data, channel_id, True) webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + compat_urllib_parse.urlencode({ 'id': base64.b64encode(channel_data['user_id'].encode()).decode(), 'channelid': base64.b64encode(channel_data['id'].encode()).decode(), 'signature': channel_data['signature'], 'countries': 'Q0M=', 'filter': 'DENY', }), channel_id) info['formats'] = self._extract_video_formats(webpage, channel_id, 'm3u8') return info class DCNSeasonIE(InfoExtractor): IE_NAME = 'dcn:season' _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' _TEST = { 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', 'info_dict': { 'id': '7910', 'title': 'محاضرات الشيخ الشعراوي', }, 'playlist_mincount': 27, } def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) show_id, season_id = re.match(self._VALID_URL, url).groups() data = {} if season_id: data['season'] = season_id show_id = smuggled_data.get('show_id') if show_id is None: request = sanitized_Request( 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, headers={'Origin': 'http://www.dcndigital.ae'}) season = self._download_json(request, season_id) show_id = season['id'] data['show_id'] = show_id request = sanitized_Request( 'http://admin.mangomolo.com/analytics/index.php/plus/show', compat_urllib_parse.urlencode(data), { 'Origin': 'http://www.dcndigital.ae', 'Content-Type': 'application/x-www-form-urlencoded' }) show = self._download_json(request, show_id) if not season_id: season_id = show['default_season'] for season in show['seasons']: if season['id'] == season_id: title = season.get('title_en') or season['title_ar'] entries = [] for video in show['videos']: video_id = compat_str(video['id']) entries.append(self.url_result( 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo', video_id)) return self.playlist_result(entries, season_id, title) 
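# DCNLiveIE/DCNVideoIE above talk to the Mangomolo embed endpoint; for live
# channels the account and channel ids are sent base64-encoded. A sketch of
# that query construction, with made-up placeholder values:
import base64
from urllib.parse import urlencode


def build_live_embed_query(user_id, channel_id, signature):
    # Mirrors the parameters DCNLiveIE passes to
    # http://admin.mangomolo.com/analytics/index.php/customers/embed/index
    return urlencode({
        'id': base64.b64encode(user_id.encode()).decode(),
        'channelid': base64.b64encode(channel_id.encode()).decode(),
        'signature': signature,
        'countries': 'Q0M=',  # fixed token from the extractor; base64 of 'CC'
        'filter': 'DENY',
    })


print(build_live_embed_query('1', '2', 'deadbeef'))  # hypothetical values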
youtube-dl/youtube_dl/extractor/rutube.py0000644000000000000000000001450112641030331017705 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re import itertools from .common import InfoExtractor from ..compat import ( compat_str, ) from ..utils import ( determine_ext, unified_strdate, ) class RutubeIE(InfoExtractor): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})' _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', 'ext': 'mp4', 'title': 'Раненный кенгуру забежал в аптеку', 'description': 'http://www.ntdtv.ru ', 'duration': 80, 'uploader': 'NTDRussian', 'uploader_id': '29790', 'upload_date': '20131016', 'age_limit': 0, }, 'params': { # It requires ffmpeg (m3u8 download) 'skip_download': True, }, }, { 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( 'http://rutube.ru/api/video/%s/?format=json' % video_id, video_id, 'Downloading video JSON') # Some videos don't have the author field author = video.get('author') or {} options = self._download_json( 'http://rutube.ru/api/play/options/%s/?format=json' % video_id, video_id, 'Downloading options JSON') formats = [] for format_id, format_url in options['video_balancer'].items(): ext = determine_ext(format_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( format_url, video_id, f4m_id=format_id, fatal=False)) else: formats.append({ 'url': format_url, 'format_id': format_id, }) self._sort_formats(formats) return { 'id': video['id'], 'title': video['title'], 'description': video['description'], 'duration': video['duration'], 'view_count': video['hits'], 'formats': formats, 'thumbnail': video['thumbnail_url'], 'uploader': author.get('name'), 'uploader_id': compat_str(author['id']) if author else None, 'upload_date': unified_strdate(video['created_ts']), 'age_limit': 18 if video['is_adult'] else 0, } class RutubeEmbedIE(InfoExtractor): IE_NAME = 'rutube:embed' IE_DESC = 'Rutube embedded videos' _VALID_URL = 'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', 'info_dict': { 'id': 'a10e53b86e8f349080f718582ce4c661', 'ext': 'mp4', 'upload_date': '20131223', 'uploader_id': '297833', 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', 'uploader': 'subziro89 ILya', 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', }, 'params': { 'skip_download': 'Requires ffmpeg', }, }, { 'url': 'http://rutube.ru/play/embed/8083783', 'only_matching': True, }] def _real_extract(self, url): 
embed_id = self._match_id(url) webpage = self._download_webpage(url, embed_id) canonical_url = self._html_search_regex( r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage, 'Canonical URL') return self.url_result(canonical_url, 'Rutube') class RutubeChannelIE(InfoExtractor): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/tags/video/1800/', 'info_dict': { 'id': '1800', }, 'playlist_mincount': 68, }] _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' def _extract_videos(self, channel_id, channel_title=None): entries = [] for pagenum in itertools.count(1): page = self._download_json( self._PAGE_TEMPLATE % (channel_id, pagenum), channel_id, 'Downloading page %s' % pagenum) results = page['results'] if not results: break entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results) if not page['has_next']: break return self.playlist_result(entries, channel_id, channel_title) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) channel_id = mobj.group('id') return self._extract_videos(channel_id) class RutubeMovieIE(RutubeChannelIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)' _TESTS = [] _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' def _real_extract(self, url): movie_id = self._match_id(url) movie = self._download_json( self._MOVIE_TEMPLATE % movie_id, movie_id, 'Downloading movie JSON') movie_name = movie['name'] return self._extract_videos(movie_id, movie_name) class RutubePersonIE(RutubeChannelIE): IE_NAME = 'rutube:person' IE_DESC = 'Rutube person videos' _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/video/person/313878/', 'info_dict': { 'id': '313878', }, 'playlist_mincount': 37, }] _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' youtube-dl/youtube_dl/extractor/tlc.py0000644000000000000000000000354712641460050017176 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE from ..compat import compat_urlparse class TlcDeIE(InfoExtractor): IE_NAME = 'tlc.de' _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)' _TEST = { 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', 'info_dict': { 'id': '3235167922001', 'ext': 'mp4', 'title': 'Breaking Amish: Die Welt da draußen', 'uploader': 'Discovery Networks - Germany', 'description': ( 'Vier Amische und eine Mennonitin wagen in New York' ' den Sprung in ein komplett anderes Leben. 
Begleitet sie auf' ' ihrem spannenden Weg.'), }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) iframe_url = self._search_regex( '<iframe src="(http://www\.tlc\.de/wp-content/.+?)"', webpage, 'iframe url') # Otherwise we don't get the correct 'BrightcoveExperience' element, # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/ iframe_url = iframe_url.replace('.htm?', '.php?') url_fragment = compat_urlparse.urlparse(url).fragment if url_fragment: # Since the fragment is not send to the server, we always get the same iframe iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url) iframe = self._download_webpage(iframe_url, title) return { '_type': 'url', 'url': BrightcoveLegacyIE._extract_brightcove_url(iframe), 'ie': BrightcoveLegacyIE.ie_key(), } youtube-dl/youtube_dl/extractor/tass.py0000644000000000000000000000374112641030331017355 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import json from .common import InfoExtractor from ..utils import ( js_to_json, qualities, ) class TassIE(InfoExtractor): _VALID_URL = r'https?://(?:tass\.ru|itar-tass\.com)/[^/]+/(?P<id>\d+)' _TESTS = [ { 'url': 'http://tass.ru/obschestvo/1586870', 'md5': '3b4cdd011bc59174596b6145cda474a4', 'info_dict': { 'id': '1586870', 'ext': 'mp4', 'title': 'Посетителям московского зоопарка показали красную панду', 'description': 'Приехавшую из Дублина Зейну можно увидеть в павильоне "Кошки тропиков"', 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://itar-tass.com/obschestvo/1600009', 'only_matching': True, }, ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) sources = json.loads(js_to_json(self._search_regex( r'(?s)sources\s*:\s*(\[.+?\])', webpage, 'sources'))) quality = qualities(['sd', 'hd']) formats = [] for source in sources: video_url = source.get('file') if not video_url or not video_url.startswith('http') or not video_url.endswith('.mp4'): continue label = source.get('label') formats.append({ 'url': video_url, 'format_id': label, 'quality': quality(label), }) self._sort_formats(formats) return { 'id': video_id, 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, } youtube-dl/youtube_dl/extractor/myvideo.py0000644000000000000000000001420612660177411020067 0ustar rootrootfrom __future__ import 
unicode_literals import binascii import base64 import hashlib import re import json from .common import InfoExtractor from ..compat import ( compat_ord, compat_urllib_parse, compat_urllib_parse_unquote, ) from ..utils import ( ExtractorError, sanitized_Request, ) class MyVideoIE(InfoExtractor): _WORKING = False _VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*' IE_NAME = 'myvideo' _TEST = { 'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', 'md5': '2d2753e8130479ba2cb7e0a37002053e', 'info_dict': { 'id': '8229274', 'ext': 'flv', 'title': 'bowling-fail-or-win', } } # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git # Released into the Public Domain by Tristan Fischer on 2013-05-19 # https://github.com/rg3/youtube-dl/pull/842 def __rc4crypt(self, data, key): x = 0 box = list(range(256)) for i in list(range(256)): x = (x + box[i] + compat_ord(key[i % len(key)])) % 256 box[i], box[x] = box[x], box[i] x = 0 y = 0 out = '' for char in data: x = (x + 1) % 256 y = (y + box[x]) % 256 box[x], box[y] = box[y], box[x] out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256]) return out def __md5(self, s): return hashlib.md5(s).hexdigest().encode() def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') GK = ( b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt' b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3' b'TnpsbA0KTVRkbU1tSTRNdz09' ) # Get video webpage webpage_url = 'http://www.myvideo.de/watch/%s' % video_id webpage = self._download_webpage(webpage_url, video_id) mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage) if mobj is not None: self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' video_title = self._html_search_regex('<title>([^<]+)</title>', webpage, 'title') return { 'id': video_id, 'url': video_url, 'title': video_title, } mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage) if mobj is not None: request = sanitized_Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '') response = self._download_webpage(request, video_id, 'Downloading video info') info = json.loads(base64.b64decode(response).decode('utf-8')) return { 'id': video_id, 'title': info['title'], 'url': info['streaming_url'].replace('rtmpe', 'rtmpt'), 'play_path': info['filename'], 'ext': 'flv', 'thumbnail': info['thumbnail'][0]['url'], } # try encxml mobj = re.search('var flashvars={(.+?)}', webpage) if mobj is None: raise ExtractorError('Unable to extract video') params = {} encxml = '' sec = mobj.group(1) for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec): if not a == '_encxml': params[a] = b else: encxml = compat_urllib_parse_unquote(b) if not params.get('domain'): params['domain'] = 'www.myvideo.de' xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params)) if 'flash_playertype=MTV' in xmldata_url: self._downloader.report_warning('avoiding MTV player') xmldata_url = ( 'http://www.myvideo.de/dynamic/get_player_video_xml.php' '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes' ) % video_id # get enc data enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1] enc_data_b = binascii.unhexlify(enc_data) sk = self.__md5( base64.b64decode(base64.b64decode(GK)) + self.__md5( str(video_id).encode('utf-8') ) ) dec_data = self.__rc4crypt(enc_data_b, sk) # extracting infos self.report_extraction(video_id) video_url = None mobj = re.search('connectionurl=\'(.*?)\'', dec_data) if mobj: video_url = 
compat_urllib_parse_unquote(mobj.group(1)) if 'myvideo2flash' in video_url: self.report_warning( 'Rewriting URL to use unencrypted rtmp:// ...', video_id) video_url = video_url.replace('rtmpe://', 'rtmp://') if not video_url: # extract non rtmp videos mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data) if mobj is None: raise ExtractorError('unable to extract url') video_url = compat_urllib_parse_unquote(mobj.group(1)) + compat_urllib_parse_unquote(mobj.group(2)) video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file') video_file = compat_urllib_parse_unquote(video_file) if not video_file.endswith('f4m'): ppath, prefix = video_file.split('.') video_playpath = '%s:%s' % (prefix, ppath) else: video_playpath = '' video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj') video_swfobj = compat_urllib_parse_unquote(video_swfobj) video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>", webpage, 'title') return { 'id': video_id, 'url': video_url, 'tc_url': video_url, 'title': video_title, 'ext': 'flv', 'play_path': video_playpath, 'player_url': video_swfobj, } youtube-dl/youtube_dl/extractor/drtuber.py0000644000000000000000000000473112641030331020052 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import str_to_int class DrTuberIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?drtuber\.com/video/(?P<id>\d+)/(?P<display_id>[\w-]+)' _TEST = { 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', 'md5': '93e680cf2536ad0dfb7e74d94a89facd', 'info_dict': { 'id': '1740434', 'display_id': 'hot-perky-blonde-naked-golf', 'ext': 'mp4', 'title': 'hot perky blonde naked golf', 'like_count': int, 'dislike_count': int, 'comment_count': int, 'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'], 'thumbnail': 're:https?://.*\.jpg$', 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) video_url = self._html_search_regex( r'<source src="([^"]+)"', webpage, 'video URL') title = self._html_search_regex( 
[r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'], webpage, 'title') thumbnail = self._html_search_regex( r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) def extract_count(id_, name): return str_to_int(self._html_search_regex( r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, webpage, '%s count' % name, fatal=False)) like_count = extract_count('rate_likes', 'like') dislike_count = extract_count('rate_dislikes', 'dislike') comment_count = extract_count('comments_count', 'comment') cats_str = self._search_regex( r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False) categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str) return { 'id': video_id, 'display_id': display_id, 'url': video_url, 'title': title, 'thumbnail': thumbnail, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, 'categories': categories, 'age_limit': self._rta_search(webpage), } youtube-dl/youtube_dl/extractor/oktoberfesttv.py0000644000000000000000000000273412641030331021305 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class OktoberfestTVIE(InfoExtractor): _VALID_URL = r'https?://www\.oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P<id>[^/?#]+)' _TEST = { 'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt', 'info_dict': { 'id': 'hb-zelt', 'ext': 'mp4', 'title': 're:^Live-Kamera: Hofbräuzelt [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'thumbnail': 're:^https?://.*\.jpg$', 'is_live': True, }, 'params': { 'skip_download': True, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._live_title(self._html_search_regex( r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title')) clip = self._search_regex( r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip') ncurl = self._search_regex( r"netConnectionUrl:\s*'([^']+)'", webpage, 'rtmp base') video_url = ncurl + clip thumbnail = self._search_regex( r"canvas:\s*\{\s*backgroundImage:\s*'url\(([^)]+)\)'", webpage, 'thumbnail', fatal=False) return { 'id': video_id, 'title': title, 'url': video_url, 'ext': 'mp4', 'is_live': True, 'thumbnail': thumbnail, } youtube-dl/youtube_dl/extractor/vidzi.py0000644000000000000000000000221512653633132017536 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import smuggle_url class VidziIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)' _TEST = { 'url': 'http://vidzi.tv/cghql9yq6emu.html', 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', 'info_dict': { 'id': 'cghql9yq6emu', 'ext': 
'mp4', 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', 'uploader': 'vidzi.tv', }, 'params': { # m3u8 download 'skip_download': True, }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') # Vidzi now uses jwplayer, which can be handled by GenericIE return { '_type': 'url_transparent', 'id': video_id, 'title': title, 'url': smuggle_url(url, {'to_generic': True}), 'ie_key': 'Generic', } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/vk.py���������������������������������������������������������������0000644�0000000�0000000�00000031106�12656352065�017040� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# encoding: utf-8 from __future__ import unicode_literals import re import json from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, ) from ..utils import ( ExtractorError, orderedSet, sanitized_Request, str_to_int, unescapeHTML, unified_strdate, ) from .vimeo import VimeoIE from .pladform import PladformIE class VKIE(InfoExtractor): IE_NAME = 'vk' IE_DESC = 'VK' _VALID_URL = r'''(?x) https?:// (?: (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)| (?: (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| (?:www\.)?biqle\.ru/watch/ ) (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$) ) ''' _NETRC_MACHINE = 'vk' _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', 'md5': '0deae91935c54e00003c2a00646315f0', 'info_dict': { 'id': '162222515', 'ext': 'flv', 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'duration': 195, 'upload_date': '20120212', 'view_count': int, }, }, { 'url': 'http://vk.com/video205387401_165548505', 'md5': '6c0aeb2e90396ba97035b9cbde548700', 'info_dict': { 'id': '165548505', 'ext': 'mp4', 'uploader': 'Tom Cruise', 'title': 'No name', 'duration': 9, 'upload_date': '20130721', 'view_count': int, } }, { 'note': 'Embedded video', 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', 'info_dict': { 'id': '162925554', 'ext': 'mp4', 'uploader': 'Vladimir Gavrin', 'title': 'Lin Dan', 'duration': 101, 'upload_date': '20120730', 'view_count': int, } }, { # VIDEO NOW REMOVED # please update if you find a video whose URL follows the same pattern 'url': 'http://vk.com/video-8871596_164049491', 'md5': 'a590bcaf3d543576c9bd162812387666', 'note': 'Only available for registered users', 'info_dict': { 'id': '164049491', 'ext': 'mp4', 'uploader': 'Триллеры', 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', 'duration': 8352, 'upload_date': '20121218', 'view_count': int, }, 'skip': 'Requires vk account credentials', }, { 'url': 
'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', 'md5': '4d7a5ef8cf114dfa09577e57b2993202', 'info_dict': { 'id': '168067957', 'ext': 'mp4', 'uploader': 'Киномания - лучшее из мира кино', 'title': ' ', 'duration': 7291, 'upload_date': '20140328', }, 'skip': 'Requires vk account credentials', }, { 'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', 'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', 'note': 'ivi.ru embed', 'info_dict': { 'id': '60690', 'ext': 'mp4', 'title': 'Книга Илая', 'duration': 6771, 'upload_date': '20140626', 'view_count': int, }, 'skip': 'Only works from Russia', }, { # video (removed?) only available with list id 'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4', 'md5': '091287af5402239a1051c37ec7b92913', 'info_dict': { 'id': '171201961', 'ext': 'mp4', 'title': 'ТюменцевВВ_09.07.2015', 'uploader': 'Anton Ivanov', 'duration': 109, 'upload_date': '20150709', 'view_count': int, }, }, { # youtube embed 'url': 'https://vk.com/video276849682_170681728', 'info_dict': { 'id': 'V3K4mi0SYkc', 'ext': 'mp4', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', 'duration': 179, 'upload_date': '20130116', 'uploader': "Children's Joy Foundation", 'uploader_id': 'thecjf', 'view_count': int, }, }, { # removed video, just testing that we match the pattern 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', 'only_matching': True, }, { # age restricted video, requires vk account credentials 'url': 'https://vk.com/video205387401_164765225', 'only_matching': True, }, { # vk wrapper 'url': 'http://www.biqle.ru/watch/847655_160197695', 'only_matching': True, }, { # pladform embed 'url': 'https://vk.com/video-76116461_171554880', 'only_matching': True, } ] def _login(self): (username, password) = self._get_login_info() if username is None: return login_page = self._download_webpage( 'https://vk.com', None, 'Downloading login page') login_form = self._hidden_inputs(login_page) login_form.update({ 'email': username.encode('cp1251'), 'pass': password.encode('cp1251'), }) request = sanitized_Request( 'https://login.vk.com/?act=login', compat_urllib_parse.urlencode(login_form).encode('utf-8')) login_page = self._download_webpage( request, None, note='Logging in as %s' % username) if re.search(r'onLoginFailed', login_page): raise ExtractorError( 'Unable to login, incorrect username and/or password', expected=True) def _real_initialize(self): self._login() def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') if not video_id: video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id # Some videos (removed?) can only be downloaded with list id specified list_id = mobj.group('list_id') if list_id: info_url += '&list=%s' % list_id info_page = self._download_webpage(info_url, video_id) error_message = self._html_search_regex( r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', info_page, 'error message', default=None) if error_message: raise ExtractorError(error_message, expected=True) if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): raise ExtractorError( 'You are trying to log in from an unusual location. 
You should confirm ownership at vk.com to log in with this IP.', expected=True) ERRORS = { r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': 'Video %s has been removed from public access due to rightholder complaint.', r'<!>Please log in or <': 'Video %s is only available for registered users, ' 'use --username and --password options to provide account credentials.', r'<!>Unknown error': 'Video %s does not exist.', r'<!>Видео временно недоступно': 'Video %s is temporarily unavailable.', r'<!>Access denied': 'Access denied to video %s.', } for error_re, error_msg in ERRORS.items(): if re.search(error_re, info_page): raise ExtractorError(error_msg % video_id, expected=True) youtube_url = self._search_regex( r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', info_page, 'youtube iframe', default=None) if youtube_url: return self.url_result(youtube_url, 'Youtube') vimeo_url = VimeoIE._extract_vimeo_url(url, info_page) if vimeo_url is not None: return self.url_result(vimeo_url) pladform_url = PladformIE._extract_url(info_page) if pladform_url: return self.url_result(pladform_url) m_rutube = re.search( r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page) if m_rutube is not None: rutube_url = self._proto_relative_url( m_rutube.group(1).replace('\\', '')) return self.url_result(rutube_url) m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) if m_opts_url: opts_url = m_opts_url.group(1) if opts_url.startswith('//'): opts_url = 'http:' + opts_url return self.url_result(opts_url) data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars') data = json.loads(data_json) # Extract upload date upload_date = None mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page) if mobj is not None: mobj.group(1) + ' ' + mobj.group(2) upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) view_count = None views = self._html_search_regex( r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', info_page, 'view count', fatal=False) if views: view_count = str_to_int(self._search_regex( r'([\d,.]+)', views, 'view count', fatal=False)) formats = [{ 'format_id': k, 'url': v, 'width': int(k[len('url'):]), } for k, v in data.items() if k.startswith('url')] self._sort_formats(formats) return { 'id': compat_str(data['vid']), 'formats': formats, 'title': unescapeHTML(data['md_title']), 'thumbnail': data.get('jpg'), 'uploader': data.get('md_author'), 'duration': data.get('duration'), 'upload_date': upload_date, 'view_count': view_count, } class VKUserVideosIE(InfoExtractor): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' _TEMPLATE_URL = 'https://vk.com/videos' _TESTS = [{ 'url': 'http://vk.com/videos205387401', 'info_dict': { 'id': '205387401', 'title': "Tom Cruise's Videos", }, 'playlist_mincount': 4, }, { 'url': 'http://vk.com/videos-77521', 'only_matching': True, }, { 'url': 'http://vk.com/videos-97664626?section=all', 'only_matching': True, }] def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) entries = [ self.url_result( 'http://vk.com/video' + video_id, 'VK', video_id=video_id) for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] title = unescapeHTML(self._search_regex( 
r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos', webpage, 'title', default=page_id)) return self.playlist_result(entries, page_id, title) ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/allocine.py���������������������������������������������������������0000644�0000000�0000000�00000006732�12653633132�020207� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# -*- coding: utf-8 -*- from __future__ import unicode_literals import re import json from .common import InfoExtractor from ..compat import compat_str from ..utils import ( qualities, unescapeHTML, xpath_element, ) class AllocineIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?P<typ>article|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=|video-)(?P<id>[0-9]+)(?:\.html)?' _TESTS = [{ 'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html', 'md5': '0c9fcf59a841f65635fa300ac43d8269', 'info_dict': { 'id': '19546517', 'ext': 'mp4', 'title': 'Astérix - Le Domaine des Dieux Teaser VF', 'description': 'md5:abcd09ce503c6560512c14ebfdb720d2', 'thumbnail': 're:http://.*\.jpg', }, }, { 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html', 'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0', 'info_dict': { 'id': '19540403', 'ext': 'mp4', 'title': 'Planes 2 Bande-annonce VF', 'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). 
Planes 2, un film de Roberts Gannaway', 'thumbnail': 're:http://.*\.jpg', }, }, { 'url': 'http://www.allocine.fr/film/fichefilm_gen_cfilm=181290.html', 'md5': '101250fb127ef9ca3d73186ff22a47ce', 'info_dict': { 'id': '19544709', 'ext': 'mp4', 'title': 'Dragons 2 - Bande annonce finale VF', 'description': 'md5:601d15393ac40f249648ef000720e7e3', 'thumbnail': 're:http://.*\.jpg', }, }, { 'url': 'http://www.allocine.fr/video/video-19550147/', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) typ = mobj.group('typ') display_id = mobj.group('id') webpage = self._download_webpage(url, display_id) if typ == 'film': video_id = self._search_regex(r'href="/video/player_gen_cmedia=([0-9]+).+"', webpage, 'video id') else: player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player', default=None) if player: player_data = json.loads(player) video_id = compat_str(player_data['refMedia']) else: model = self._search_regex(r'data-model="([^"]+)">', webpage, 'data model') model_data = self._parse_json(unescapeHTML(model), display_id) video_id = compat_str(model_data['id']) xml = self._download_xml('http://www.allocine.fr/ws/AcVisiondataV4.ashx?media=%s' % video_id, display_id) video = xpath_element(xml, './/AcVisionVideo').attrib quality = qualities(['ld', 'md', 'hd']) formats = [] for k, v in video.items(): if re.match(r'.+_path', k): format_id = k.split('_')[0] formats.append({ 'format_id': format_id, 'quality': quality(format_id), 'url': v, }) self._sort_formats(formats) return { 'id': video_id, 'title': video['videoTitle'], 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'description': self._og_search_description(webpage), } ��������������������������������������youtube-dl/youtube_dl/extractor/tunein.py�����������������������������������������������������������0000644�0000000�0000000�00000012747�12641030331�017713� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import json from .common import InfoExtractor from ..utils import ExtractorError from ..compat import compat_urlparse class TuneInBaseIE(InfoExtractor): _API_BASE_URL = 'http://tunein.com/tuner/tune/' def _real_extract(self, url): content_id = self._match_id(url) content_info = self._download_json( self._API_BASE_URL + self._API_URL_QUERY % content_id, content_id, note='Downloading JSON metadata') title = content_info['Title'] thumbnail = content_info.get('Logo') location = content_info.get('Location') streams_url = content_info.get('StreamUrl') if not streams_url: raise ExtractorError('No downloadable streams found', expected=True) if not streams_url.startswith('http://'): streams_url = compat_urlparse.urljoin(url, streams_url) stream_data = self._download_webpage( streams_url, content_id, note='Downloading stream data') streams = json.loads(self._search_regex( r'\((.*)\);', stream_data, 'stream info'))['Streams'] is_live = None formats = [] for stream in streams: if stream.get('Type') == 'Live': is_live = True reliability = stream.get('Reliability') format_note = ( 'Reliability: %d%%' % reliability if reliability is not None else None) formats.append({ 'preference': ( 0 if reliability is None or 
reliability > 90 else 1), 'abr': stream.get('Bandwidth'), 'ext': stream.get('MediaType').lower(), 'acodec': stream.get('MediaType'), 'vcodec': 'none', 'url': stream.get('Url'), 'source_preference': reliability, 'format_note': format_note, }) self._sort_formats(formats) return { 'id': content_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'location': location, 'is_live': is_live, } class TuneInClipIE(TuneInBaseIE): IE_NAME = 'tunein:clip' _VALID_URL = r'https?://(?:www\.)?tunein\.com/station/.*?audioClipId\=(?P<id>\d+)' _API_URL_QUERY = '?tuneType=AudioClip&audioclipId=%s' _TESTS = [ { 'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816', 'md5': '99f00d772db70efc804385c6b47f4e77', 'info_dict': { 'id': '816', 'title': '32m', 'ext': 'mp3', }, }, ] class TuneInStationIE(TuneInBaseIE): IE_NAME = 'tunein:station' _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId\=)(?P<id>\d+)' _API_URL_QUERY = '?tuneType=Station&stationId=%s' @classmethod def suitable(cls, url): return False if TuneInClipIE.suitable(url) else super(TuneInStationIE, cls).suitable(url) _TESTS = [ { 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', 'info_dict': { 'id': '34682', 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', 'ext': 'mp3', 'location': 'Tacoma, WA', }, 'params': { 'skip_download': True, # live stream }, }, ] class TuneInProgramIE(TuneInBaseIE): IE_NAME = 'tunein:program' _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId\=)(?P<id>\d+)' _API_URL_QUERY = '?tuneType=Program&programId=%s' _TESTS = [ { 'url': 'http://tunein.com/radio/Jazz-24-p2506/', 'info_dict': { 'id': '2506', 'title': 'Jazz 24 on 91.3 WUKY-HD3', 'ext': 'mp3', 'location': 'Lexington, KY', }, 'params': { 'skip_download': True, # live stream }, }, ] class TuneInTopicIE(TuneInBaseIE): IE_NAME = 'tunein:topic' _VALID_URL = r'https?://(?:www\.)?tunein\.com/topic/.*?TopicId\=(?P<id>\d+)' _API_URL_QUERY = '?tuneType=Topic&topicId=%s' _TESTS = [ { 'url': 'http://tunein.com/topic/?TopicId=101830576', 'md5': 'c31a39e6f988d188252eae7af0ef09c9', 'info_dict': { 'id': '101830576', 'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)', 'ext': 'mp3', 'location': 'Belgium', }, }, ] class TuneInShortenerIE(InfoExtractor): IE_NAME = 'tunein:shortener' IE_DESC = False # Do not list _VALID_URL = r'https?://tun\.in/(?P<id>[A-Za-z0-9]+)' _TEST = { # test redirection 'url': 'http://tun.in/ser7s', 'info_dict': { 'id': '34682', 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', 'ext': 'mp3', 'location': 'Tacoma, WA', }, 'params': { 'skip_download': True, # live stream }, } def _real_extract(self, url): redirect_id = self._match_id(url) # The server doesn't support HEAD requests urlh = self._request_webpage( url, redirect_id, note='Downloading redirect page') url = urlh.geturl() self.to_screen('Following redirect: %s' % url) return self.url_result(url) �������������������������youtube-dl/youtube_dl/extractor/ebaumsworld.py������������������������������������������������������0000644�0000000�0000000�00000002037�12641030331�020724� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals from .common import InfoExtractor class EbaumsWorldIE(InfoExtractor): 
_VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P<id>\d+)' _TEST = { 'url': 'http://www.ebaumsworld.com/video/watch/83367677/', 'info_dict': { 'id': '83367677', 'ext': 'mp4', 'title': 'A Giant Python Opens The Door', 'description': 'This is how nightmares start...', 'uploader': 'jihadpizza', }, } def _real_extract(self, url): video_id = self._match_id(url) config = self._download_xml( 'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id) video_url = config.find('file').text return { 'id': video_id, 'title': config.find('title').text, 'url': video_url, 'description': config.find('description').text, 'thumbnail': config.find('image').text, 'uploader': config.find('username').text, } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/commonmistakes.py���������������������������������������������������0000644�0000000�0000000�00000002555�12641030331�021436� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ExtractorError class CommonMistakesIE(InfoExtractor): IE_DESC = False # Do not list _VALID_URL = r'''(?x) (?:url|URL) ''' _TESTS = [{ 'url': 'url', 'only_matching': True, }, { 'url': 'URL', 'only_matching': True, }] def _real_extract(self, url): msg = ( 'You\'ve asked youtube-dl to download the URL "%s". ' 'That doesn\'t make any sense. ' 'Simply remove the parameter in your command or configuration.' ) % url if not self._downloader.params.get('verbose'): msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.' raise ExtractorError(msg, expected=True) class UnicodeBOMIE(InfoExtractor): IE_DESC = False _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$' _TESTS = [{ 'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc', 'only_matching': True, }] def _real_extract(self, url): real_url = self._match_id(url) self.report_warning( 'Your URL starts with a Byte Order Mark (BOM). ' 'Removing the BOM and looking for "%s" ...' 
% real_url) return self.url_result(real_url) ���������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/moevideo.py���������������������������������������������������������0000644�0000000�0000000�00000007173�12641030331�020215� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, ) class MoeVideoIE(InfoExtractor): IE_DESC = 'LetitBit video services: moevideo.net, playreplay.net and videochart.net' _VALID_URL = r'''(?x) https?://(?P<host>(?:www\.)? (?:(?:moevideo|playreplay|videochart)\.net))/ (?:video|framevideo)/(?P<id>[0-9]+\.[0-9A-Za-z]+)''' _API_URL = 'http://api.letitbit.net/' _API_KEY = 'tVL0gjqo5' _TESTS = [ { 'url': 'http://moevideo.net/video/00297.0036103fe3d513ef27915216fd29', 'md5': '129f5ae1f6585d0e9bb4f38e774ffb3a', 'info_dict': { 'id': '00297.0036103fe3d513ef27915216fd29', 'ext': 'flv', 'title': 'Sink cut out machine', 'description': 'md5:f29ff97b663aefa760bf7ca63c8ca8a8', 'thumbnail': 're:^https?://.*\.jpg$', 'width': 540, 'height': 360, 'duration': 179, 'filesize': 17822500, } }, { 'url': 'http://playreplay.net/video/77107.7f325710a627383d40540d8e991a', 'md5': '74f0a014d5b661f0f0e2361300d1620e', 'info_dict': { 'id': '77107.7f325710a627383d40540d8e991a', 'ext': 'flv', 'title': 'Operacion Condor.', 'description': 'md5:7e68cb2fcda66833d5081c542491a9a3', 'thumbnail': 're:^https?://.*\.jpg$', 'width': 480, 'height': 296, 'duration': 6027, 'filesize': 588257923, }, 'skip': 'Video has been removed', }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage( 'http://%s/video/%s' % (mobj.group('host'), video_id), video_id, 'Downloading webpage') title = self._og_search_title(webpage) thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage) r = [ self._API_KEY, [ 'preview/flv_link', { 'uid': video_id, }, ], ] r_json = json.dumps(r) post = compat_urllib_parse.urlencode({'r': r_json}) req = sanitized_Request(self._API_URL, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') response = self._download_json(req, video_id) if response['status'] != 'OK': raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, response['data']), expected=True ) item = response['data'][0] video_url = item['link'] duration = int_or_none(item['length']) width = int_or_none(item['width']) height = int_or_none(item['height']) filesize = int_or_none(item['convert_size']) formats = [{ 'format_id': 'sd', 'http_headers': {'Range': 'bytes=0-'}, # Required to download 'url': video_url, 'width': width, 'height': height, 'filesize': filesize, }] return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'description': description, 'duration': duration, 'formats': formats, } 
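# A minimal, standalone sketch of the LetitBit API request shape that
# MoeVideoIE above relies on: the API takes a single form field 'r' holding
# a JSON-encoded [api_key, [method, params]] envelope, POSTed as
# application/x-www-form-urlencoded. The API URL, key, method name and uid
# below are taken from the extractor above; the helper name and the demo in
# __main__ are illustrative assumptions, not part of youtube-dl.
import json

try:
    from urllib.parse import urlencode  # Python 3
    from urllib.request import Request
except ImportError:  # Python 2, which youtube-dl also supports
    from urllib import urlencode
    from urllib2 import Request


def build_letitbit_preview_request(api_key, video_id):
    # The API expects one form field 'r' holding the JSON envelope
    # [key, [method, params]].
    envelope = [api_key, ['preview/flv_link', {'uid': video_id}]]
    data = urlencode({'r': json.dumps(envelope)}).encode('utf-8')
    req = Request('http://api.letitbit.net/', data)
    req.add_header('Content-type', 'application/x-www-form-urlencoded')
    return req


if __name__ == '__main__':
    req = build_letitbit_preview_request(
        'tVL0gjqo5', '00297.0036103fe3d513ef27915216fd29')
    print(req.get_full_url())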
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/stanfordoc.py�������������������������������������������������������0000644�0000000�0000000�00000006706�12641030331�020551� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, orderedSet, unescapeHTML, ) class StanfordOpenClassroomIE(InfoExtractor): IE_NAME = 'stanfordoc' IE_DESC = 'Stanford Open ClassRoom' _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' _TEST = { 'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', 'md5': '544a9468546059d4e80d76265b0443b8', 'info_dict': { 'id': 'PracticalUnix_intro-environment', 'ext': 'mp4', 'title': 'Intro Environment', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj.group('course') and mobj.group('video'): # A specific video course = mobj.group('course') video = mobj.group('video') info = { 'id': course + '_' + video, 'uploader': None, 'upload_date': None, } baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' xmlUrl = baseUrl + video + '.xml' mdoc = self._download_xml(xmlUrl, info['id']) try: info['title'] = mdoc.findall('./title')[0].text info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text except IndexError: raise ExtractorError('Invalid metadata XML file') return info elif mobj.group('course'): # A course page course = mobj.group('course') info = { 'id': course, '_type': 'playlist', 'uploader': None, 'upload_date': None, } coursepage = self._download_webpage( url, info['id'], note='Downloading course info page', errnote='Unable to download course info page') info['title'] = self._html_search_regex( r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) info['description'] = self._html_search_regex( r'(?s)<description>([^<]+)</description>', coursepage, 'description', fatal=False) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['entries'] = [self.url_result( 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) ) for l in links] return info else: # Root page info = { 'id': 'Stanford OpenClassroom', '_type': 'playlist', 'uploader': None, 'upload_date': None, } info['title'] = info['id'] rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' rootpage = self._download_webpage(rootURL, info['id'], errnote='Unable to download course info page') links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) info['entries'] = [self.url_result( 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) ) for l in links] return info 
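# A short, self-contained illustration of the orderedSet() helper that
# StanfordOpenClassroomIE above applies to its re.findall() results:
# duplicate links are dropped while first-seen order is preserved, which a
# plain set() would not guarantee. This mirrors the behaviour of
# youtube_dl.utils.orderedSet; the function name and sample data here are
# illustrative only.
def ordered_set(iterable):
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)  # keep only the first occurrence, in order
    return res


if __name__ == '__main__':
    links = ['CoursePage.php?course=PracticalUnix',
             'CoursePage.php?course=HCI',
             'CoursePage.php?course=PracticalUnix']
    print(ordered_set(links))
    # -> ['CoursePage.php?course=PracticalUnix', 'CoursePage.php?course=HCI']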
����������������������������������������������������������youtube-dl/youtube_dl/extractor/sportbox.py���������������������������������������������������������0000644�0000000�0000000�00000010050�12641030331�020252� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( unified_strdate, ) class SportBoxIE(InfoExtractor): _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' _TESTS = [{ 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', 'md5': 'ff56a598c2cf411a9a38a69709e97079', 'info_dict': { 'id': '80822', 'ext': 'mp4', 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20140928', }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', 'only_matching': True, }, { 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) player = self._search_regex( r'src="/?(vdl/player/[^"]+)"', webpage, 'player') title = self._html_search_regex( [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], webpage, 'title') description = self._og_search_description(webpage) or self._html_search_meta( 'description', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_meta( 'dateCreated', webpage, 'upload date')) return { '_type': 'url_transparent', 'url': compat_urlparse.urljoin(url, '/%s' % player), 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, } class SportBoxEmbedIE(InfoExtractor): _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'info_dict': { 'id': '211355', 'ext': 'mp4', 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', 'thumbnail': 're:^https?://.*\.jpg$', }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"', webpage) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) hls = self._search_regex( r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]", webpage, 'hls file') formats = self._extract_m3u8_formats(hls, video_id, 'mp4') title = self._search_regex( 
r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title') thumbnail = self._search_regex( r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"', webpage, 'thumbnail', default=None) return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'formats': formats, } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/liveleak.py���������������������������������������������������������0000644�0000000�0000000�00000011514�12660177411�020210� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..utils import int_or_none class LiveLeakIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)' _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', 'md5': '50f79e05ba149149c1b4ea961223d5b3', 'info_dict': { 'id': '757_1364311680', 'ext': 'flv', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', 'title': 'Most unlucky car accident' } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', 'md5': 'b13a29626183c9d33944e6a04f41aafc', 'info_dict': { 'id': 'f93_1390833151', 'ext': 'mp4', 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', 'uploader': 'ARD_Stinkt', 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', } }, { 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', 'md5': '42c6d97d54f1db107958760788c5f48f', 'info_dict': { 'id': '4f7_1392687779', 'ext': 'mp4', 'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing... I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.", 'uploader': 'CapObveus', 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', 'age_limit': 18, } }, { # Covers https://github.com/rg3/youtube-dl/pull/5983 'url': 'http://www.liveleak.com/view?i=801_1409392012', 'md5': '0b3bec2d888c20728ca2ad3642f0ef15', 'info_dict': { 'id': '801_1409392012', 'ext': 'mp4', 'description': 'Happened on 27.7.2014. 
\r\nAt 0:53 you can see people still swimming at near beach.', 'uploader': 'bony333', 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia' } }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() video_description = self._og_search_description(webpage) video_uploader = self._html_search_regex( r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False) age_limit = int_or_none(self._search_regex( r'you confirm that you are ([0-9]+) years and over.', webpage, 'age limit', default=None)) sources_raw = self._search_regex( r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) if sources_raw is None: alt_source = self._search_regex( r'(file: ".*?"),', webpage, 'video URL', default=None) if alt_source: sources_raw = '[{ %s}]' % alt_source else: # Maybe an embed? embed_url = self._search_regex( r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"', webpage, 'embed URL') return { '_type': 'url_transparent', 'url': embed_url, 'id': video_id, 'title': video_title, 'description': video_description, 'uploader': video_uploader, 'age_limit': age_limit, } sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) sources = json.loads(sources_json) formats = [{ 'format_id': '%s' % i, 'format_note': s.get('label'), 'url': s['file'], } for i, s in enumerate(sources)] for i, s in enumerate(sources): # Removing '.h264_*.mp4' gives the raw video, which is essentially # the same video without the LiveLeak logo at the top (see # https://github.com/rg3/youtube-dl/pull/4768) orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file']) if s['file'] != orig_url: formats.append({ 'format_id': 'original-%s' % i, 'format_note': s.get('label'), 'url': orig_url, 'preference': 1, }) self._sort_formats(formats) return { 'id': video_id, 'title': video_title, 'description': video_description, 'uploader': video_uploader, 'formats': formats, 'age_limit': age_limit, } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/crackle.py����������������������������������������������������������0000644�0000000�0000000�00000006623�12657443441�020032� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import int_or_none class CrackleIE(InfoExtractor): _VALID_URL = r'(?:crackle:|https?://(?:www\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' _TEST = { 'url': 'http://www.crackle.com/the-art-of-more/2496419', 'info_dict': { 'id': '2496419', 'ext': 'mp4', 'title': 'Heavy Lies the Head', 'description': 'md5:bb56aa0708fe7b9a4861535f15c3abca', }, 'params': { # m3u8 download 'skip_download': True, } } # extracted from http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx _SUBTITLE_SERVER = 'http://web-us-az.crackle.com' _UPLYNK_OWNER_ID = 'e8773f7770a44dbd886eee4fca16a66b' _THUMBNAIL_TEMPLATE = 'http://images-us-am.crackle.com/%stnl_1920x1080.jpg?ts=20140107233116?c=635333335057637614' # extracted from 
http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx _MEDIA_FILE_SLOTS = { 'c544.flv': { 'width': 544, 'height': 306, }, '360p.mp4': { 'width': 640, 'height': 360, }, '480p.mp4': { 'width': 852, 'height': 478, }, '480p_1mbps.mp4': { 'width': 852, 'height': 478, }, } def _real_extract(self, url): video_id = self._match_id(url) item = self._download_xml( 'http://legacyweb-us.crackle.com/app/revamp/vidwallcache.aspx?flags=-1&fm=%s' % video_id, video_id).find('i') title = item.attrib['t'] thumbnail = None subtitles = {} formats = self._extract_m3u8_formats( 'http://content.uplynk.com/ext/%s/%s.m3u8' % (self._UPLYNK_OWNER_ID, video_id), video_id, 'mp4', m3u8_id='hls', fatal=None) path = item.attrib.get('p') if path: thumbnail = self._THUMBNAIL_TEMPLATE % path http_base_url = 'http://ahttp.crackle.com/' + path for mfs_path, mfs_info in self._MEDIA_FILE_SLOTS.items(): formats.append({ 'url': http_base_url + mfs_path, 'format_id': 'http-' + mfs_path.split('.')[0], 'width': mfs_info['width'], 'height': mfs_info['height'], }) for cc in item.findall('cc'): locale = cc.attrib.get('l') v = cc.attrib.get('v') if locale and v: if locale not in subtitles: subtitles[locale] = [] subtitles[locale] = [{ 'url': '%s/%s%s_%s.xml' % (self._SUBTITLE_SERVER, path, locale, v), 'ext': 'ttml', }] self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) return { 'id': video_id, 'title': title, 'description': item.attrib.get('d'), 'duration': int(item.attrib.get('r'), 16) if item.attrib.get('r') else None, 'series': item.attrib.get('sn'), 'season_number': int_or_none(item.attrib.get('se')), 'episode_number': int_or_none(item.attrib.get('ep')), 'thumbnail': thumbnail, 'subtitles': subtitles, 'formats': formats, } �������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/cnet.py�������������������������������������������������������������0000644�0000000�0000000�00000007007�12641030331�017333� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals from .theplatform import ThePlatformIE from ..utils import int_or_none class CNETIE(ThePlatformIE): _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', 'ext': 'flv', 'title': 'Hands-on with Microsoft Windows 8.1 Update', 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', 'duration': 70, }, }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', 'info_dict': { 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', 'ext': 'flv', 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. 
#TDPothole', 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'duration': 1482, }, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( r"data-cnet-video(?:-uvp)?-options='([^']+)'", webpage, 'data json') data = self._parse_json(data_json, display_id) vdata = data.get('video') or data['videos'][0] video_id = vdata['id'] title = vdata['title'] author = vdata.get('author') if author: uploader = '%s %s' % (author['firstName'], author['lastName']) uploader_id = author.get('id') else: uploader = None uploader_id = None mpx_account = data['config']['uvpConfig']['default']['mpx_account'] metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id) description = vdata.get('description') or metadata.get('description') duration = int_or_none(vdata.get('duration')) or metadata.get('duration') formats = [] subtitles = {} for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid) if fkey == 'hds': release_url += '&manifest=f4m' tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': metadata.get('thumbnail'), 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, 'subtitles': subtitles, 'formats': formats, } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/infoq.py������������������������������������������������������������0000644�0000000�0000000�00000010426�12641030331�017515� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import base64 from .common import InfoExtractor from ..compat import ( compat_urllib_parse_unquote, compat_parse_qs, ) from ..utils import determine_ext class InfoQIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things', 'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2', 'info_dict': { 'id': 'A-Few-of-My-Favorite-Python-Things', 'ext': 'mp4', 'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.', 'title': 'A Few of My Favorite [Python] Things', }, }, { 'url': 
'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript', 'only_matching': True, }, { 'url': 'http://www.infoq.com/cn/presentations/openstack-continued-delivery', 'md5': '4918d0cca1497f2244572caf626687ef', 'info_dict': { 'id': 'openstack-continued-delivery', 'title': 'OpenStack持续交付之路', 'ext': 'flv', 'description': 'md5:308d981fb28fa42f49f9568322c683ff', }, }] def _extract_bokecc_videos(self, webpage, video_id): # TODO: bokecc.com is a Chinese video cloud platform # It should have an independent extractor but I don't have other # examples using bokecc player_params_str = self._html_search_regex( r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', webpage, 'player params', default=None) player_params = compat_parse_qs(player_params_str) info_xml = self._download_xml( 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( player_params['siteid'][0], player_params['vid'][0]), video_id) return [{ 'format_id': 'bokecc', 'url': quality.find('./copy').attrib['playurl'], 'preference': int(quality.attrib['value']), } for quality in info_xml.findall('./video/quality')] def _extract_rtmp_videos(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' # Extract video URL encoded_id = self._search_regex( r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None) real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) playpath = 'mp4:' + real_id return [{ 'format_id': 'rtmp', 'url': video_url, 'ext': determine_ext(playpath), 'play_path': playpath, }] def _extract_http_videos(self, webpage): http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') return [{ 'format_id': 'http', 'url': http_video_url, 'http_headers': { 'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( policy, signature, key_pair_id), }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') video_description = self._html_search_meta('description', webpage, 'description') if '/cn/' in url: # for China videos, HTTP video URL exists but always fails with 403 formats = self._extract_bokecc_videos(webpage, video_id) else: formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) self._sort_formats(formats) return { 'id': video_id, 'title': video_title, 'description': video_description, 'formats': formats, } youtube-dl/youtube_dl/extractor/tapely.py0000644000000000000000000000742212641030331017701 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( clean_html, ExtractorError, float_or_none, parse_iso8601, sanitized_Request, ) class TapelyIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
_API_URL = 'http://tape.ly/showtape?id={0:}' _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' _TESTS = [ { 'url': 'http://tape.ly/my-grief-as-told-by-water', 'info_dict': { 'id': 23952, 'title': 'my grief as told by water', 'thumbnail': 're:^https?://.*\.png$', 'uploader_id': 16484, 'timestamp': 1411848286, 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', }, 'playlist_count': 13, }, { 'url': 'http://tape.ly/my-grief-as-told-by-water/1', 'md5': '79031f459fdec6530663b854cbc5715c', 'info_dict': { 'id': 258464, 'title': 'Dreaming Awake (My Brightest Diamond)', 'ext': 'm4a', }, }, { 'url': 'https://tapely.com/my-grief-as-told-by-water', 'only_matching': True, }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') playlist_url = self._API_URL.format(display_id) request = sanitized_Request(playlist_url) request.add_header('X-Requested-With', 'XMLHttpRequest') request.add_header('Accept', 'application/json') request.add_header('Referer', url) playlist = self._download_json(request, display_id) tape = playlist['tape'] entries = [] for s in tape['songs']: song = s['song'] entry = { 'id': song['id'], 'duration': float_or_none(song.get('songduration'), 1000), 'title': song['title'], } if song['source'] == 'S3': entry.update({ 'url': self._S3_SONG_URL.format(song['filename']), }) entries.append(entry) elif song['source'] == 'YT': self.to_screen('YouTube video detected') yt_id = song['filename'].replace('/youtube/', '') entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) entries.append(entry) elif song['source'] == 'SC': self.to_screen('SoundCloud song detected') sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) entry.update(self.url_result(sc_url, 'Soundcloud')) entries.append(entry) else: self.report_warning('Unknown song source: %s' % song['source']) if mobj.group('songnr'): songnr = int(mobj.group('songnr')) - 1 try: return entries[songnr] except IndexError: raise ExtractorError( 'No song with index: %s' % mobj.group('songnr'), expected=True) return { '_type': 'playlist', 'id': tape['id'], 'display_id': display_id, 'title': tape['name'], 'entries': entries, 'thumbnail': tape.get('image_url'), 'description': clean_html(tape.get('subtext')), 'like_count': tape.get('likescount'), 'uploader_id': tape.get('user_id'), 'timestamp': parse_iso8601(tape.get('published_at')), } youtube-dl/youtube_dl/extractor/nba.py0000644000000000000000000000746512653373215017164 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( parse_duration, int_or_none, xpath_text, xpath_attr, ) class NBAIE(InfoExtractor): _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', 'md5': '9e7729d3010a9c71506fd1248f74e4f4', 'info_dict': { 'id': '0021200253-okc-bkn-recap', 'ext': 'mp4', 'title': 'Thunder vs.
Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, 'timestamp': 1354638466, 'upload_date': '20121204', }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, }, { 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { 'id': '0041400301-cle-atl-recap', 'ext': 'mp4', 'title': 'Hawks vs. Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, 'timestamp': 1432134543, 'upload_date': '20150520', } }] def _real_extract(self, url): path, video_id = re.match(self._VALID_URL, url).groups() if path.startswith('nba/'): path = path[3:] video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id) video_id = xpath_text(video_info, 'slug') title = xpath_text(video_info, 'headline') description = xpath_text(video_info, 'description') duration = parse_duration(xpath_text(video_info, 'length')) timestamp = int_or_none(xpath_attr(video_info, 'dateCreated', 'uts')) thumbnails = [] for image in video_info.find('images'): thumbnails.append({ 'id': image.attrib.get('cut'), 'url': image.text, 'width': int_or_none(image.attrib.get('width')), 'height': int_or_none(image.attrib.get('height')), }) formats = [] for video_file in video_info.findall('.//file'): video_url = video_file.text if video_url.startswith('/'): continue if video_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) elif video_url.endswith('.f4m'): formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds', fatal=False)) else: key = video_file.attrib.get('bitrate') format_info = { 'format_id': key, 'url': video_url, } mobj = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key) if mobj: format_info.update({ 'width': int(mobj.group(1)), 'height': int(mobj.group(2)), 'tbr': int_or_none(mobj.group(3)), }) formats.append(format_info) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'duration': duration, 'timestamp': timestamp, 'thumbnails': thumbnails, 'formats': formats, } youtube-dl/youtube_dl/extractor/animeondemand.py0000644000000000000000000001342412660177411021215 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( determine_ext, encode_dict, ExtractorError, sanitized_Request, urlencode_postdata, ) class AnimeOnDemandIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)' _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' _TEST = { 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', 'title': 'Grimgar, Ashes and Illusions (OmU)', 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', }, 'playlist_mincount': 4, } def _login(self): (username, password) = self._get_login_info() if username is None: return login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') login_form = self._form_hidden_inputs('new_user', login_page) login_form.update({ 'user[login]': username, 'user[password]': password, }) post_url = self._search_regex( r'<form[^>]+action=(["\'])(?P<url>.+?)\1',
login_page, 'post url', default=self._LOGIN_URL, group='url') if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) request = sanitized_Request( post_url, urlencode_postdata(encode_dict(login_form))) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( request, None, 'Logging in as %s' % username) if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( r'<p class="alert alert-danger">(.+?)</p>', response, 'error', default=None) if error: raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') def _real_initialize(self): self._login() def _real_extract(self, url): anime_id = self._match_id(url) webpage = self._download_webpage(url, anime_id) if 'data-playlist=' not in webpage: self._download_webpage( self._APPLY_HTML5_URL, anime_id, 'Activating HTML5 beta', 'Unable to apply HTML5 beta') webpage = self._download_webpage(url, anime_id) csrf_token = self._html_search_meta( 'csrf-token', webpage, 'csrf token', fatal=True) anime_title = self._html_search_regex( r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>', webpage, 'anime name') anime_description = self._html_search_regex( r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>
entries = [] for episode_html in re.findall(r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage): m = re.search( r'class="episodebox-title"[^>]+title="Episode (?P<number>\d+) - (?P<title>.+?)"', episode_html) if not m: continue episode_number = int(m.group('number')) episode_title = m.group('title') video_id = 'episode-%d' % episode_number common_info = { 'id': video_id, 'series': anime_title, 'episode': episode_title, 'episode_number': episode_number, } formats = [] playlist_url = self._search_regex( r'data-playlist=(["\'])(?P<url>.+?)\1', episode_html, 'data playlist', default=None, group='url') if playlist_url: request = sanitized_Request( compat_urlparse.urljoin(url, playlist_url), headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-CSRF-Token': csrf_token, 'Referer': url, 'Accept': 'application/json, text/javascript, */*; q=0.01', }) playlist = self._download_json( request, video_id, 'Downloading playlist JSON', fatal=False) if playlist: playlist = playlist['playlist'][0] title = playlist['title'] description = playlist.get('description') for source in playlist.get('sources', []): file_ = source.get('file') if file_ and determine_ext(file_) == 'm3u8': formats = self._extract_m3u8_formats( file_, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if formats: f = common_info.copy() f.update({ 'title': title, 'description': description, 'formats': formats, }) entries.append(f) m = re.search( r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', episode_html) if m: f = common_info.copy() f.update({ 'id': '%s-teaser' % f['id'], 'title': m.group('title'), 'url': compat_urlparse.urljoin(url, m.group('href')), }) entries.append(f) return self.playlist_result(entries, anime_id, anime_title, anime_description) youtube-dl/youtube_dl/extractor/lovehomeporn.py0000644000000000000000000000224212650650456021133 0ustar rootrootfrom __future__ import unicode_literals import re from .nuevo import NuevoBaseIE class LoveHomePornIE(NuevoBaseIE): _VALID_URL = r'https?://(?:www\.)?lovehomeporn\.com/video/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
_TEST = { 'url': 'http://lovehomeporn.com/video/48483/stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick#menu', 'info_dict': { 'id': '48483', 'display_id': 'stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick', 'ext': 'mp4', 'title': 'Stunning busty brunette girlfriend sucking and riding a big dick', 'age_limit': 18, 'duration': 238.47, }, 'params': { 'skip_download': True, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') info = self._extract_nuevo( 'http://lovehomeporn.com/media/nuevo/config.php?key=%s' % video_id, video_id) info.update({ 'display_id': display_id, 'age_limit': 18 }) return info youtube-dl/youtube_dl/extractor/xboxclips.py0000644000000000000000000000363712641030331020422 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, parse_filesize, unified_strdate, ) class XboxClipsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})' _TEST = { 'url': 'https://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', 'ext': 'mp4', 'title': 'Iabdulelah playing Titanfall', 'filesize_approx': 26800000, 'upload_date': '20140807', 'duration': 56, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._html_search_regex( r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL') title = self._html_search_regex( r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title') upload_date = unified_strdate(self._html_search_regex( r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) filesize = parse_filesize(self._html_search_regex( r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) duration = int_or_none(self._html_search_regex( r'>Duration: (\d+) Seconds<', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( r'>Views: (\d+)<', webpage, 'view count', fatal=False)) return { 'id': video_id, 'url': video_url, 'title': title, 'upload_date': upload_date, 'filesize_approx': filesize, 'duration': duration, 'view_count': view_count, } youtube-dl/youtube_dl/extractor/pornoxo.py0000644000000000000000000000412012641030331020103 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( str_to_int, ) class PornoXOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html' _TEST = { 'url':
'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', 'md5': '582f28ecbaa9e6e24cb90f50f524ce87', 'info_dict': { 'id': '7564', 'ext': 'flv', 'title': 'Striptease From Sexy Secretary!', 'description': 'Striptease From Sexy Secretary!', 'categories': list, # NSFW 'thumbnail': 're:https?://.*\.jpg$', 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) video_url = self._html_search_regex( r'\'file\'\s*:\s*"([^"]+)"', webpage, 'video_url') title = self._html_search_regex( r'([^<]+)\s*-\s*PornoXO', webpage, 'title') description = self._html_search_regex( r'<meta name="description" content="([^"]+)\s*featuring', webpage, 'description', fatal=False) thumbnail = self._html_search_regex( r'\'image\'\s*:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) view_count = str_to_int(self._html_search_regex( r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False)) categories_str = self._html_search_regex( r'<meta name="description" content=".*featuring\s*([^"]+)"', webpage, 'categories', fatal=False) categories = ( None if categories_str is None else categories_str.split(',')) return { 'id': video_id, 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'categories': categories, 'view_count': view_count, 'age_limit': 18, } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/twitter.py����������������������������������������������������������0000644�0000000�0000000�00000025036�12662564617�020134� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( float_or_none, xpath_text, remove_end, int_or_none, ExtractorError, sanitized_Request, ) class TwitterBaseIE(InfoExtractor): def _get_vmap_video_url(self, vmap_url, video_id): vmap_data = self._download_xml(vmap_url, video_id) return xpath_text(vmap_data, './/MediaFile').strip() class TwitterCardIE(TwitterBaseIE): IE_NAME = 'twitter:card' _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', # MD5 checksums are different in different places 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', 'title': 'TwitterCard', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 30.033, } }, { 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', 'title': 'TwitterCard', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 80.155, }, }, { 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', 'md5': 'd4724ffe6d2437886d004fa5de1043b3', 'info_dict': { 'id': 'dq4Oj5quskI', 'ext': 
'mp4', 'title': 'Ubuntu 11.10 Overview', 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', 'upload_date': '20111013', 'uploader': 'OMG! Ubuntu!', 'uploader_id': 'omgubuntu', }, 'add_ie': ['Youtube'], }, { 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', 'md5': 'ab2745d0b0ce53319a534fccaa986439', 'info_dict': { 'id': 'iBb2x00UVlv', 'ext': 'mp4', 'upload_date': '20151113', 'uploader_id': '1189339351084113920', 'uploader': 'ArsenalTerje', 'title': 'Vine by ArsenalTerje', }, 'add_ie': ['Vine'], } ] def _real_extract(self, url): video_id = self._match_id(url) # Different formats served for different User-Agents USER_AGENTS = [ 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm ] config = None formats = [] for user_agent in USER_AGENTS: request = sanitized_Request(url) request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) iframe_url = self._html_search_regex( r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', webpage, 'video iframe', default=None) if iframe_url: return self.url_result(iframe_url) config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) if 'playlist' not in config: if 'vmapUrl' in config: formats.append({ 'url': self._get_vmap_video_url(config['vmapUrl'], video_id), }) break # same video regardless of UA continue video_url = config['playlist'][0]['source'] f = { 'url': video_url, } m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) if m: f.update({ 'width': int(m.group('width')), 'height': int(m.group('height')), }) formats.append(f) self._sort_formats(formats) thumbnail = config.get('posterImageUrl') duration = float_or_none(config.get('duration')) return { 'id': video_id, 'title': 'TwitterCard', 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, } class TwitterIE(InfoExtractor): IE_NAME = 'twitter' _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)' _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', # MD5 checksums are different in different places 'info_dict': { 'id': '643211948184596480', 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 12.922, 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! 
http://t.co/c7jHH749xJ"', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', 'info_dict': { 'id': '657991469417025536', 'ext': 'mp4', 'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai', 'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"', 'thumbnail': 're:^https?://.*\.png', 'uploader': 'Gifs', 'uploader_id': 'giphz', }, 'expected_warnings': ['height', 'width'], }, { 'url': 'https://twitter.com/starwars/status/665052190608723968', 'md5': '39b7199856dee6cd4432e72c74bc69d4', 'info_dict': { 'id': '665052190608723968', 'ext': 'mp4', 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."', 'uploader_id': 'starwars', 'uploader': 'Star Wars', }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('user_id') twid = mobj.group('id') webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid) username = remove_end(self._og_search_title(webpage), ' on Twitter') title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames title = re.sub(r'\s+(https?://[^ ]+)', '', title) info = { 'uploader_id': user_id, 'uploader': username, 'webpage_url': url, 'description': '%s on Twitter: "%s"' % (username, description), 'title': username + ' - ' + title, } card_id = self._search_regex( r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None) if card_id: card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id info.update({ '_type': 'url_transparent', 'ie_key': 'TwitterCard', 'url': card_url, }) return info mobj = re.search(r'''(?x) <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s* <source[^>]+video-src="(?P<url>[^"]+)" ''', webpage) if mobj: more_info = mobj.group('more_info') height = int_or_none(self._search_regex( r'data-height="(\d+)"', more_info, 'height', fatal=False)) width = int_or_none(self._search_regex( r'data-width="(\d+)"', more_info, 'width', fatal=False)) thumbnail = self._search_regex( r'poster="([^"]+)"', more_info, 'poster', fatal=False) info.update({ 'id': twid, 'url': mobj.group('url'), 'height': height, 'width': width, 'thumbnail': thumbnail, }) return info raise ExtractorError('There\'s no video in this tweet.') class TwitterAmplifyIE(TwitterBaseIE): IE_NAME = 'twitter:amplify' _VALID_URL = 'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})' _TEST = { 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', 'md5': '7df102d0b9fd7066b86f3159f8e81bf6', 'info_dict': { 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', 'ext': 'mp4', 'title': 'Twitter Video', 'thumbnail': 're:^https?://.*', }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) vmap_url = self._html_search_meta( 'twitter:amplify:vmap', webpage, 'vmap url') video_url = self._get_vmap_video_url(vmap_url, video_id) thumbnails = [] thumbnail = self._html_search_meta( 'twitter:image:src', webpage, 'thumbnail', fatal=False) def _find_dimension(target): w = int_or_none(self._html_search_meta( 'twitter:%s:width' % target, webpage, fatal=False)) h = 
int_or_none(self._html_search_meta( 'twitter:%s:height' % target, webpage, fatal=False)) return w, h if thumbnail: thumbnail_w, thumbnail_h = _find_dimension('image') thumbnails.append({ 'url': thumbnail, 'width': thumbnail_w, 'height': thumbnail_h, }) video_w, video_h = _find_dimension('player') formats = [{ 'url': video_url, 'width': video_w, 'height': video_h, }] return { 'id': video_id, 'title': 'Twitter Video', 'formats': formats, 'thumbnails': thumbnails, } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/voicerepublic.py����������������������������������������������������0000644�0000000�0000000�00000007314�12641030331�021236� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( ExtractorError, determine_ext, int_or_none, sanitized_Request, ) class VoiceRepublicIE(InfoExtractor): _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)' _TESTS = [{ 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', 'md5': '0554a24d1657915aa8e8f84e15dc9353', 'info_dict': { 'id': '2296', 'display_id': 'watching-the-watchers-building-a-sousveillance-state', 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', 'description': 'md5:715ba964958afa2398df615809cfecb1', 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', 'duration': 1800, 'view_count': int, } }, { 'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state', 'only_matching': True, }] def _real_extract(self, url): display_id = self._match_id(url) req = sanitized_Request( compat_urlparse.urljoin(url, '/talks/%s' % display_id)) # Older versions of Firefox get redirected to an "upgrade browser" page req.add_header('User-Agent', 'youtube-dl') webpage = self._download_webpage(req, display_id) if '>Queued for processing, please stand by...<' in webpage: raise ExtractorError( 'Audio is still queued for processing', expected=True) config = self._search_regex( r'(?s)return ({.+?});\s*\n', webpage, 'data', default=None) data = self._parse_json(config, display_id, fatal=False) if config else None if data: title = data['title'] description = data.get('teaser') talk_id = data.get('talk_id') or display_id talk = data['talk'] duration = int_or_none(talk.get('duration')) formats = [{ 'url': compat_urlparse.urljoin(url, talk_url), 'format_id': format_id, 'ext': determine_ext(talk_url) or format_id, 'vcodec': 'none', } for format_id, talk_url in talk['links'].items()] else: title = self._og_search_title(webpage) description = self._html_search_regex( r"(?s)<div class='talk-teaser'[^>]*>(.+?)</div>", webpage, 'description', fatal=False) talk_id = 
self._search_regex( [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], webpage, 'talk id', default=None) or display_id duration = None player = self._search_regex( r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player') formats = [{ 'url': compat_urlparse.urljoin(url, talk_url), 'format_id': format_id, 'ext': determine_ext(talk_url) or format_id, 'vcodec': 'none', } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)] self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) view_count = int_or_none(self._search_regex( r"class='play-count[^']*'>\s*(\d+) plays", webpage, 'play count', fatal=False)) return { 'id': talk_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, 'formats': formats, } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/motorsport.py�������������������������������������������������������0000644�0000000�0000000�00000003405�12641030331�020630� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_urlparse, ) class MotorsportIE(InfoExtractor): IE_DESC = 'motorsport.com' _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', 'info_dict': { 'id': '2-T3WuR-KMM', 'ext': 'mp4', 'title': 'Red Bull Racing: 2014 Rules Explained', 'duration': 208, 'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.', 'uploader': 'mcomstaff', 'uploader_id': 'UC334JIYKkVnyFoNCclfZtHQ', 'upload_date': '20140903', 'thumbnail': r're:^https?://.+\.jpg$' }, 'add_ie': ['Youtube'], 'params': { 'skip_download': True, }, } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) iframe_path = self._html_search_regex( r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage, 'iframe path') iframe = self._download_webpage( compat_urlparse.urljoin(url, iframe_path), display_id, 'Downloading iframe') youtube_id = self._search_regex( r'www.youtube.com/embed/(.{11})', iframe, 'youtube id') return { '_type': 'url_transparent', 'display_id': display_id, 'url': 'https://youtube.com/watch?v=%s' % youtube_id, } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/primesharetv.py�����������������������������������������������������0000644�0000000�0000000�00000003531�12641030331�021111� 
0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, sanitized_Request, ) class PrimeShareTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?primeshare\.tv/download/(?P<id>[\da-zA-Z]+)' _TEST = { 'url': 'http://primeshare.tv/download/238790B611', 'md5': 'b92d9bf5461137c36228009f31533fbc', 'info_dict': { 'id': '238790B611', 'ext': 'mp4', 'title': 'Public Domain - 1960s Commercial - Crest Toothpaste-YKsuFona', }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) if '>File not exist<' in webpage: raise ExtractorError('Video %s does not exist' % video_id, expected=True) fields = self._hidden_inputs(webpage) headers = { 'Referer': url, 'Content-Type': 'application/x-www-form-urlencoded', } wait_time = int(self._search_regex( r'var\s+cWaitTime\s*=\s*(\d+)', webpage, 'wait time', default=7)) + 1 self._sleep(wait_time, video_id) req = sanitized_Request( url, compat_urllib_parse.urlencode(fields), headers) video_page = self._download_webpage( req, video_id, 'Downloading video page') video_url = self._search_regex( r"url\s*:\s*'([^']+\.primeshare\.tv(?::443)?/file/[^']+)'", video_page, 'video url') title = self._html_search_regex( r'<h1>Watch\s*(?:&nbsp;)?\s*\((.+?)(?:\s*\[\.\.\.\])?\)\s*(?:&nbsp;)?\s*<strong>', video_page, 'title') return { 'id': video_id, 'url': video_url, 'title': title, 'ext': 'mp4', } youtube-dl/youtube_dl/extractor/hotstar.py0000644000000000000000000000614512656652211020075 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, determine_ext, int_or_none, ) class HotStarIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' _TESTS = [{ 'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', 'info_dict': { 'id': '1000076273', 'ext': 'mp4', 'title': 'On Air With AIB - English', 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', 'timestamp': 1447227000, 'upload_date': '20151111', 'duration': 381, }, 'params': { # m3u8 download 'skip_download': True, } }, { 'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583', 'only_matching': True, }, { 'url': 'http://www.hotstar.com/1000000515', 'only_matching': True, }] _GET_CONTENT_TEMPLATE = 'http://account.hotstar.com/AVS/besc?action=GetAggregatedContentDetails&channel=PCTV&contentId=%s' _GET_CDN_TEMPLATE = 'http://getcdn.hotstar.com/AVS/besc?action=GetCDN&asJson=Y&channel=%s&id=%s&type=%s' def
_download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True): json_data = super(HotStarIE, self)._download_json(url_or_request, video_id, note, fatal=fatal) if json_data['resultCode'] != 'OK': if fatal: raise ExtractorError(json_data['errorDescription']) return None return json_data['resultObj'] def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( self._GET_CONTENT_TEMPLATE % video_id, video_id)['contentInfo'][0] formats = [] # PCTV for extracting f4m manifest for f in ('TABLET',): format_data = self._download_json( self._GET_CDN_TEMPLATE % (f, video_id, 'VOD'), video_id, 'Downloading %s JSON metadata' % f, fatal=False) if format_data: format_url = format_data['src'] ext = determine_ext(format_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) elif ext == 'f4m': # produce broken files continue else: formats.append({ 'url': format_url, 'width': int_or_none(format_data.get('width')), 'height': int_or_none(format_data.get('height')), }) self._sort_formats(formats) return { 'id': video_id, 'title': video_data['episodeTitle'], 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(video_data.get('broadcastDate')), 'formats': formats, } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/nextmedia.py��������������������������������������������������������0000644�0000000�0000000�00000015221�12641030331�020355� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import parse_iso8601 class NextMediaIE(InfoExtractor): IE_DESC = '蘋果日報' _VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199', 'md5': 'dff9fad7009311c421176d1ac90bfe4f', 'info_dict': { 'id': '53109199', 'ext': 'mp4', 'title': '【佔領金鐘】50外國領事議員撐場 讚學生勇敢香港有希望', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:28222b9912b6665a21011b034c70fcc7', 'timestamp': 1415456273, 'upload_date': '20141108', } }] _URL_PATTERN = r'\{ url: \'(.+)\' \}' def _real_extract(self, url): news_id = self._match_id(url) page = self._download_webpage(url, news_id) return self._extract_from_nextmedia_page(news_id, url, page) def _extract_from_nextmedia_page(self, news_id, url, page): title = self._fetch_title(page) video_url = self._search_regex(self._URL_PATTERN, page, 'video url') attrs = { 'id': news_id, 'title': title, 'url': video_url, # ext can be inferred from url 'thumbnail': self._fetch_thumbnail(page), 'description': self._fetch_description(page), } timestamp = self._fetch_timestamp(page) if timestamp: attrs['timestamp'] = timestamp else: 
attrs['upload_date'] = self._fetch_upload_date(url) return attrs def _fetch_title(self, page): return self._og_search_title(page) def _fetch_thumbnail(self, page): return self._og_search_thumbnail(page) def _fetch_timestamp(self, page): dateCreated = self._search_regex('"dateCreated":"([^"]+)"', page, 'created time') return parse_iso8601(dateCreated) def _fetch_upload_date(self, url): return self._search_regex(self._VALID_URL, url, 'upload date', group='date') def _fetch_description(self, page): return self._og_search_property('description', page) class NextMediaActionNewsIE(NextMediaIE): IE_DESC = '蘋果日報 - 動新聞' _VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' _TESTS = [{ 'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460', 'md5': '05fce8ffeed7a5e00665d4b7cf0f9201', 'info_dict': { 'id': '19009428', 'ext': 'mp4', 'title': '【壹週刊】細10年男友偷食 50歲邵美琪再失戀', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:cd802fad1f40fd9ea178c1e2af02d659', 'timestamp': 1421791200, 'upload_date': '20150120', } }] def _real_extract(self, url): news_id = self._match_id(url) actionnews_page = self._download_webpage(url, news_id) article_url = self._og_search_url(actionnews_page) article_page = self._download_webpage(article_url, news_id) return self._extract_from_nextmedia_page(news_id, url, article_page) class AppleDailyIE(NextMediaIE): IE_DESC = '臺灣蘋果日報' _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', 'info_dict': { 'id': '36354694', 'ext': 'mp4', 'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4', 'upload_date': '20150128', } }, { 'url': 'http://www.appledaily.com.tw/realtimenews/article/strange/20150128/550549/%E4%B8%8D%E6%BB%BF%E8%A2%AB%E8%B8%A9%E8%85%B3%E3%80%80%E5%B1%B1%E6%9D%B1%E5%85%A9%E5%A4%A7%E5%AA%BD%E4%B8%80%E8%B7%AF%E6%89%93%E4%B8%8B%E8%BB%8A', 'md5': '86b4e9132d158279c7883822d94ccc49', 'info_dict': { 'id': '550549', 'ext': 'mp4', 'title': '不滿被踩腳 山東兩大媽一路打下車', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:175b4260c1d7c085993474217e4ab1b4', 'upload_date': '20150128', } }, { 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671', 'md5': '03df296d95dedc2d5886debbb80cb43f', 'info_dict': { 'id': '5003671', 'ext': 'mp4', 'title': '20正妹熱舞 《刀龍傳說Online》火辣上市', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd', 'upload_date': '20150128', }, 'skip': 'redirect to http://www.appledaily.com.tw/animation/', }, { # No thumbnail 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/', 'md5': 'b06182cd386ea7bc6115ec7ff0f72aeb', 'info_dict': { 'id': '5003673', 'ext': 'mp4', 'title': '半夜尿尿 好像會看到___', 'description': 'md5:61d2da7fe117fede148706cdb85ac066', 'upload_date': '20150128', }, 'expected_warnings': [ 'video thumbnail', ], 'skip': 'redirect to http://www.appledaily.com.tw/animation/', }, { 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', 'md5': 'eaa20e6b9df418c912d7f5dec2ba734d', 'info_dict': { 'id': '35770334', 'ext': 'mp4', 'title': '咖啡占卜測 XU裝熟指數', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:7b859991a6a4fedbdf3dd3b66545c748', 'upload_date': '20140417', }, }] _URL_PATTERN = r'\{url: 
\'(.+)\'\}' def _fetch_title(self, page): return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or self._html_search_meta('description', page, 'news title')) def _fetch_thumbnail(self, page): return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) def _fetch_timestamp(self, page): return None def _fetch_description(self, page): return self._html_search_meta('description', page, 'news description') �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/vier.py�������������������������������������������������������������0000644�0000000�0000000�00000010060�12641030331�017340� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import re import itertools from .common import InfoExtractor class VierIE(InfoExtractor): IE_NAME = 'vier' _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' _TESTS = [{ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', 'info_dict': { 'id': '16129', 'display_id': 'het-wordt-warm-de-moestuin', 'ext': 'mp4', 'title': 'Het wordt warm in De Moestuin', 'description': 'De vele uren werk eisen hun tol. 
Wim droomt van assistentie...', }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', 'only_matching': True, }, { 'url': 'http://www.vier.be/video/v3/embed/16129', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) embed_id = mobj.group('embed_id') display_id = mobj.group('display_id') or embed_id webpage = self._download_webpage(url, display_id) video_id = self._search_regex( [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], webpage, 'video id') application = self._search_regex( [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], webpage, 'application', default='vier_vod') filename = self._search_regex( [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], webpage, 'filename') playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename) formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') title = self._og_search_title(webpage, default=display_id) description = self._og_search_description(webpage, default=None) thumbnail = self._og_search_thumbnail(webpage, default=None) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'formats': formats, } class VierVideosIE(InfoExtractor): IE_NAME = 'vier:videos' _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' _TESTS = [{ 'url': 'http://www.vier.be/demoestuin/videos', 'info_dict': { 'id': 'demoestuin', }, 'playlist_mincount': 153, }, { 'url': 'http://www.vier.be/demoestuin/videos?page=6', 'info_dict': { 'id': 'demoestuin-page6', }, 'playlist_mincount': 20, }, { 'url': 'http://www.vier.be/demoestuin/videos?page=7', 'info_dict': { 'id': 'demoestuin-page7', }, 'playlist_mincount': 13, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) program = mobj.group('program') page_id = mobj.group('page') if page_id: page_id = int(page_id) start_page = page_id playlist_id = '%s-page%d' % (program, page_id) else: start_page = 0 playlist_id = program entries = [] for current_page_id in itertools.count(start_page): current_page = self._download_webpage( 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), program, 'Downloading page %d' % (current_page_id + 1)) page_entries = [ self.url_result('http://www.vier.be' + video_url, 'Vier') for video_url in re.findall( r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] entries.extend(page_entries) if page_id or '>Meer<' not in current_page: break return self.playlist_result(entries, playlist_id) ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/beatportpro.py������������������������������������������������������0000644�0000000�0000000�00000006537�12641030331�020752� 0����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_str from ..utils import int_or_none class BeatportProIE(InfoExtractor): _VALID_URL = r'https?://pro\.beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://pro.beatport.com/track/synesthesia-original-mix/5379371', 'md5': 'b3c34d8639a2f6a7f734382358478887', 'info_dict': { 'id': '5379371', 'display_id': 'synesthesia-original-mix', 'ext': 'mp4', 'title': 'Froxic - Synesthesia (Original Mix)', }, }, { 'url': 'https://pro.beatport.com/track/love-and-war-original-mix/3756896', 'md5': 'e44c3025dfa38c6577fbaeb43da43514', 'info_dict': { 'id': '3756896', 'display_id': 'love-and-war-original-mix', 'ext': 'mp3', 'title': 'Wolfgang Gartner - Love & War (Original Mix)', }, }, { 'url': 'https://pro.beatport.com/track/birds-original-mix/4991738', 'md5': 'a1fd8e8046de3950fd039304c186c05f', 'info_dict': { 'id': '4991738', 'display_id': 'birds-original-mix', 'ext': 'mp4', 'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) track_id = mobj.group('id') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) playables = self._parse_json( self._search_regex( r'window\.Playables\s*=\s*({.+?});', webpage, 'playables info', flags=re.DOTALL), track_id) track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] if track['mix']: title += ' (' + track['mix'] + ')' formats = [] for ext, info in track['preview'].items(): if not info['url']: continue fmt = { 'url': info['url'], 'ext': ext, 'format_id': ext, 'vcodec': 'none', } if ext == 'mp3': fmt['preference'] = 0 fmt['acodec'] = 'mp3' fmt['abr'] = 96 fmt['asr'] = 44100 elif ext == 'mp4': fmt['preference'] = 1 fmt['acodec'] = 'aac' fmt['abr'] = 96 fmt['asr'] = 44100 formats.append(fmt) self._sort_formats(formats) images = [] for name, info in track['images'].items(): image_url = info.get('url') if name == 'dynamic' or not image_url: continue image = { 'id': name, 'url': image_url, 'height': int_or_none(info.get('height')), 'width': int_or_none(info.get('width')), } images.append(image) return { 'id': compat_str(track.get('id')) or track_id, 'display_id': track.get('slug') or display_id, 'title': title, 'formats': formats, 'thumbnails': images, } �����������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/freespeech.py�������������������������������������������������������0000644�0000000�0000000�00000002312�12641030331�020505� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import re import json from .common import InfoExtractor class FreespeechIE(InfoExtractor): IE_NAME = 
'freespeech.org' _VALID_URL = r'https://www\.freespeech\.org/video/(?P<title>.+)' _TEST = { 'add_ie': ['Youtube'], 'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0', 'info_dict': { 'id': 'poKsVCZ64uU', 'ext': 'mp4', 'title': 'Obama, Romney Campaign in Colorado Ahead of Debate', 'description': 'Obama, Romney Campaign in Colorado Ahead of Debate', 'uploader': 'freespeechtv', 'uploader_id': 'freespeechtv', 'upload_date': '20121002', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) info_json = self._search_regex(r'jQuery.extend\(Drupal.settings, ({.*?})\);', webpage, 'info') info = json.loads(info_json) return { '_type': 'url', 'url': info['jw_player']['basic_video_node_player']['file'], 'ie_key': 'Youtube', } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/svt.py��������������������������������������������������������������0000644�0000000�0000000�00000007700�12650650456�017236� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( determine_ext, ) class SVTBaseIE(InfoExtractor): def _extract_video(self, url, video_id): info = self._download_json(url, video_id) title = info['context']['title'] thumbnail = info['context'].get('thumbnailImage') video_info = info['video'] formats = [] for vr in video_info['videoReferences']: vurl = vr['url'] ext = determine_ext(vurl) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=vr.get('playerType'))) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( vurl + '?hdcore=3.3.0', video_id, f4m_id=vr.get('playerType'))) else: formats.append({ 'format_id': vr.get('playerType'), 'url': vurl, }) self._sort_formats(formats) subtitles = {} subtitle_references = video_info.get('subtitleReferences') if isinstance(subtitle_references, list): for sr in subtitle_references: subtitle_url = sr.get('url') if subtitle_url: subtitles.setdefault('sv', []).append({'url': subtitle_url}) duration = video_info.get('materialLength') age_limit = 18 if video_info.get('inappropriateForChildren') else 0 return { 'id': video_id, 'title': title, 'formats': formats, 'subtitles': subtitles, 'thumbnail': thumbnail, 'duration': duration, 'age_limit': age_limit, } class SVTIE(SVTBaseIE): _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)' _TEST = { 'url': 'http://www.svt.se/wd?widgetId=23991§ionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false', 'md5': '9648197555fc1b49e3dc22db4af51d46', 'info_dict': { 'id': '2900353', 'ext': 'flv', 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', 'duration': 27, 'age_limit': 0, }, } @staticmethod def _extract_url(webpage): mobj = re.search( 
r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage) if mobj: return mobj.group('url') def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) widget_id = mobj.group('widget_id') article_id = mobj.group('id') return self._extract_video( 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id), article_id) class SVTPlayIE(SVTBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', 'info_dict': { 'id': '5996901', 'ext': 'mp4', 'title': 'Flygplan till Haile Selassie', 'duration': 3527, 'thumbnail': 're:^https?://.*[\.-]jpg$', 'age_limit': 0, 'subtitles': { 'sv': [{ 'ext': 'wsrt', }] }, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') host = mobj.group('host') return self._extract_video( 'http://www.%s.se/video/%s?output=json' % (host, video_id), video_id) ����������������������������������������������������������������youtube-dl/youtube_dl/extractor/iconosquare.py������������������������������������������������������0000644�0000000�0000000�00000005645�12641030331�020740� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, get_element_by_id, remove_end, ) class IconosquareIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)' _TEST = { 'url': 'http://statigr.am/p/522207370455279102_24101272', 'md5': '6eb93b882a3ded7c378ee1d6884b1814', 'info_dict': { 'id': '522207370455279102_24101272', 'ext': 'mp4', 'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)', 'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d', 'timestamp': 1376471991, 'upload_date': '20130814', 'uploader': 'aguynamedpatrick', 'uploader_id': '24101272', 'comment_count': int, 'like_count': int, }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) media = self._parse_json( get_element_by_id('mediaJson', webpage), video_id) formats = [{ 'url': f['url'], 'format_id': format_id, 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')) } for format_id, f in media['videos'].items()] self._sort_formats(formats) title = remove_end(self._og_search_title(webpage), ' - via Iconosquare') timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time')) description = media.get('caption', {}).get('text') uploader = media.get('user', {}).get('username') uploader_id = media.get('user', {}).get('id') comment_count = int_or_none(media.get('comments', {}).get('count')) like_count = int_or_none(media.get('likes', {}).get('count')) thumbnails = [{ 'url': t['url'], 'id': thumbnail_id, 'width': int_or_none(t.get('width')), 'height': int_or_none(t.get('height')) } for thumbnail_id, t in media.get('images', {}).items()] comments = [{ 'id': comment.get('id'), 'text': comment['text'], 'timestamp': 
int_or_none(comment.get('created_time')), 'author': comment.get('from', {}).get('full_name'), 'author_id': comment.get('from', {}).get('username'), } for comment in media.get('comments', {}).get('data', []) if 'text' in comment] return { 'id': video_id, 'title': title, 'description': description, 'thumbnails': thumbnails, 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, 'comment_count': comment_count, 'like_count': like_count, 'formats': formats, 'comments': comments, } �������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/camdemy.py����������������������������������������������������������0000644�0000000�0000000�00000012461�12641030331�020021� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import datetime import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urlparse, ) from ..utils import ( parse_iso8601, str_to_int, ) class CamdemyIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' _TESTS = [{ # single file 'url': 'http://www.camdemy.com/media/5181/', 'md5': '5a5562b6a98b37873119102e052e311b', 'info_dict': { 'id': '5181', 'ext': 'mp4', 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', 'thumbnail': 're:^https?://.*\.jpg$', 'description': '', 'creator': 'ss11spring', 'upload_date': '20130114', 'timestamp': 1358154556, 'view_count': int, } }, { # With non-empty description 'url': 'http://www.camdemy.com/media/13885', 'md5': '4576a3bb2581f86c61044822adbd1249', 'info_dict': { 'id': '13885', 'ext': 'mp4', 'title': 'EverCam + Camdemy QuickStart', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:050b62f71ed62928f8a35f1a41e186c9', 'creator': 'evercam', 'upload_date': '20140620', 'timestamp': 1403271569, } }, { # External source 'url': 'http://www.camdemy.com/media/14842', 'md5': '50e1c3c3aa233d3d7b7daa2fa10b1cf7', 'info_dict': { 'id': '2vsYQzNIsJo', 'ext': 'mp4', 'upload_date': '20130211', 'uploader': 'Hun Kim', 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'uploader_id': 'hunkimtutorials', 'title': 'Excel 2013 Tutorial - How to add Password Protection', } }] def _real_extract(self, url): video_id = self._match_id(url) page = self._download_webpage(url, video_id) src_from = self._html_search_regex( r"<div class='srcFrom'>Source: <a title='([^']+)'", page, 'external source', default=None) if src_from: return self.url_result(src_from) oembed_obj = self._download_json( 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id) thumb_url = oembed_obj['thumbnail_url'] video_folder = compat_urlparse.urljoin(thumb_url, 'video/') file_list_doc = self._download_xml( compat_urlparse.urljoin(video_folder, 'fileList.xml'), video_id, 'Filelist XML') file_name = file_list_doc.find('./video/item/fileName').text video_url = compat_urlparse.urljoin(video_folder, file_name) timestamp = parse_iso8601(self._html_search_regex( r"<div class='title'>Posted\s*:</div>\s*<div class='value'>([^<>]+)<", page, 'creation time', fatal=False), delimiter=' ', timezone=datetime.timedelta(hours=8)) view_count = str_to_int(self._html_search_regex( 
r"<div class='title'>Views\s*:</div>\s*<div class='value'>([^<>]+)<", page, 'view count', fatal=False)) return { 'id': video_id, 'url': video_url, 'title': oembed_obj['title'], 'thumbnail': thumb_url, 'description': self._html_search_meta('description', page), 'creator': oembed_obj['author_name'], 'duration': oembed_obj['duration'], 'timestamp': timestamp, 'view_count': view_count, } class CamdemyFolderIE(InfoExtractor): _VALID_URL = r'http://www.camdemy.com/folder/(?P<id>\d+)' _TESTS = [{ # links with trailing slash 'url': 'http://www.camdemy.com/folder/450', 'info_dict': { 'id': '450', 'title': '信號與系統 2012 & 2011 (Signals and Systems)', }, 'playlist_mincount': 145 }, { # links without trailing slash # and multi-page 'url': 'http://www.camdemy.com/folder/853', 'info_dict': { 'id': '853', 'title': '科學計算 - 使用 Matlab' }, 'playlist_mincount': 20 }, { # with displayMode parameter. For testing the codes to add parameters 'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg', 'info_dict': { 'id': '853', 'title': '科學計算 - 使用 Matlab' }, 'playlist_mincount': 20 }] def _real_extract(self, url): folder_id = self._match_id(url) # Add displayMode=list so that all links are displayed in a single page parsed_url = list(compat_urlparse.urlparse(url)) query = dict(compat_urlparse.parse_qsl(parsed_url[4])) query.update({'displayMode': 'list'}) parsed_url[4] = compat_urllib_parse.urlencode(query) final_url = compat_urlparse.urlunparse(parsed_url) page = self._download_webpage(final_url, folder_id) matches = re.findall(r"href='(/media/\d+/?)'", page) entries = [self.url_result('http://www.camdemy.com' + media_path) for media_path in matches] folder_title = self._html_search_meta('keywords', page) return self.playlist_result(entries, folder_id, folder_title) ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/thisamericanlife.py�������������������������������������������������0000644�0000000�0000000�00000003015�12641030331�021704� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals from .common import InfoExtractor class ThisAmericanLifeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?thisamericanlife\.org/(?:radio-archives/episode/|play_full\.php\?play=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.thisamericanlife.org/radio-archives/episode/487/harper-high-school-part-one', 'md5': '8f7d2da8926298fdfca2ee37764c11ce', 'info_dict': { 'id': '487', 'ext': 'm4a', 'title': '487: Harper High School, Part One', 'description': 'md5:ee40bdf3fb96174a9027f76dbecea655', 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://www.thisamericanlife.org/play_full.php?play=487', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://www.thisamericanlife.org/radio-archives/episode/%s' % video_id, video_id) return { 'id': video_id, 'url': 'http://stream.thisamericanlife.org/{0}/stream/{0}_64k.m3u8'.format(video_id), 'protocol': 'm3u8_native', 'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none', 'abr': 64, 'title': 
self._html_search_meta(r'twitter:title', webpage, 'title', fatal=True), 'description': self._html_search_meta(r'description', webpage, 'description'), 'thumbnail': self._og_search_thumbnail(webpage), } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/chirbit.py����������������������������������������������������������0000644�0000000�0000000�00000005030�12641030331�020020� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( parse_duration, int_or_none, ) class ChirbitIE(InfoExtractor): IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'http://chirb.it/PrIPv5', 'md5': '9847b0dad6ac3e074568bf2cfb197de8', 'info_dict': { 'id': 'PrIPv5', 'ext': 'mp3', 'title': 'Фасадстрой', 'duration': 52, 'view_count': int, 'comment_count': int, } }, { 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', 'only_matching': True, }] def _real_extract(self, url): audio_id = self._match_id(url) webpage = self._download_webpage( 'http://chirb.it/%s' % audio_id, audio_id) audio_url = self._search_regex( r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') title = self._search_regex( r'itemprop="name">([^<]+)', webpage, 'title') duration = parse_duration(self._html_search_meta( 'duration', webpage, 'duration', fatal=False)) view_count = int_or_none(self._search_regex( r'itemprop="playCount"\s*>(\d+)', webpage, 'listen count', fatal=False)) comment_count = int_or_none(self._search_regex( r'>(\d+) Comments?:', webpage, 'comment count', fatal=False)) return { 'id': audio_id, 'url': audio_url, 'title': title, 'duration': duration, 'view_count': view_count, 'comment_count': comment_count, } class ChirbitProfileIE(InfoExtractor): IE_NAME = 'chirbit:profile' _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', 'info_dict': { 'id': 'ScarletBeauty', 'title': 'Chirbits by ScarletBeauty', }, 'playlist_mincount': 3, } def _real_extract(self, url): profile_id = self._match_id(url) rss = self._download_xml( 'http://chirbit.com/rss/%s' % profile_id, profile_id) entries = [ self.url_result(audio_url.text, 'Chirbit') for audio_url in rss.findall('./channel/item/link')] title = rss.find('./channel/title').text return self.playlist_result(entries, profile_id, title) 
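# A standalone sketch (not part of youtube-dl) of the RSS-to-playlist pattern
# ChirbitProfileIE uses above: download the profile feed, turn every
# ./channel/item/link into a playlist entry and reuse the channel title.
# The feed URL scheme is the one hard-coded in the extractor; the helper
# name `chirbit_profile_entries` is illustrative only.
import xml.etree.ElementTree as ET
from urllib.request import urlopen


def chirbit_profile_entries(profile_id):
    # ElementTree delegates find/findall on the tree to its root element.
    feed = ET.parse(urlopen('http://chirbit.com/rss/%s' % profile_id))
    title = feed.findtext('./channel/title')
    entries = [link.text for link in feed.findall('./channel/item/link')]
    return title, entries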
youtube-dl/youtube_dl/extractor/exfm.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor


class ExfmIE(InfoExtractor):
    IE_NAME = 'exfm'
    IE_DESC = 'ex.fm'
    _VALID_URL = r'http://(?:www\.)?ex\.fm/song/(?P<id>[^/]+)'
    _SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
    _TESTS = [
        {
            'url': 'http://ex.fm/song/eh359',
            'md5': 'e45513df5631e6d760970b14cc0c11e7',
            'info_dict': {
                'id': '44216187',
                'ext': 'mp3',
                'title': 'Test House "Love Is Not Enough" (Extended Mix) DeadJournalist Exclusive',
                'uploader': 'deadjournalist',
                'upload_date': '20120424',
                'description': 'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',
            },
            'note': 'Soundcloud song',
            'skip': 'The site is down too often',
        },
        {
            'url': 'http://ex.fm/song/wddt8',
            'md5': '966bd70741ac5b8570d8e45bfaed3643',
            'info_dict': {
                'id': 'wddt8',
                'ext': 'mp3',
                'title': 'Safe and Sound',
                'uploader': 'Capital Cities',
            },
            'skip': 'The site is down too often',
        },
    ]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        song_id = mobj.group('id')
        info_url = 'http://ex.fm/api/v3/song/%s' % song_id
        info = self._download_json(info_url, song_id)['song']
        song_url = info['url']
        if re.match(self._SOUNDCLOUD_URL, song_url) is not None:
            self.to_screen('Soundcloud song detected')
            return self.url_result(song_url.replace('/stream', ''), 'Soundcloud')
        return {
            'id': song_id,
            'url': song_url,
            'ext': 'mp3',
            'title': info['title'],
            'thumbnail': info['image']['large'],
            'uploader': info['artist'],
            'view_count': info['loved_count'],
        }
youtube-dl/youtube_dl/extractor/pluralsight.py
from
__future__ import unicode_literals import re import json import random import collections from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, compat_urlparse, ) from ..utils import ( ExtractorError, int_or_none, parse_duration, qualities, sanitized_Request, ) class PluralsightBaseIE(InfoExtractor): _API_BASE = 'http://app.pluralsight.com' class PluralsightIE(PluralsightBaseIE): IE_NAME = 'pluralsight' _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/training/player\?' _LOGIN_URL = 'https://app.pluralsight.com/id/' _NETRC_MACHINE = 'pluralsight' _TESTS = [{ 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', 'md5': '4d458cf5cf4c593788672419a8dd4cf8', 'info_dict': { 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', 'ext': 'mp4', 'title': 'Management of SQL Server - Demo Monitoring', 'duration': 338, }, 'skip': 'Requires pluralsight account credentials', }, { 'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live', 'only_matching': True, }, { # available without pluralsight account 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', 'only_matching': True, }] def _real_initialize(self): self._login() def _login(self): (username, password) = self._get_login_info() if username is None: return login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') login_form = self._hidden_inputs(login_page) login_form.update({ 'Username': username.encode('utf-8'), 'Password': password.encode('utf-8'), }) post_url = self._search_regex( r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', default=self._LOGIN_URL, group='url') if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) request = sanitized_Request( post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) request.add_header('Content-Type', 'application/x-www-form-urlencoded') response = self._download_webpage( request, None, 'Logging in as %s' % username) error = self._search_regex( r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', response, 'error message', default=None) if error: raise ExtractorError('Unable to login: %s' % error, expected=True) if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): raise ExtractorError('Unable to log in') def _real_extract(self, url): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) author = qs.get('author', [None])[0] name = qs.get('name', [None])[0] clip_id = qs.get('clip', [None])[0] course = qs.get('course', [None])[0] if any(not f for f in (author, name, clip_id, course,)): raise ExtractorError('Invalid URL', expected=True) display_id = '%s-%s' % (name, clip_id) webpage = self._download_webpage(url, display_id) modules = self._search_regex( r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)', webpage, 'modules', default=None) if modules: collection = self._parse_json(modules, display_id) else: # Webpage may be served in different layout (see # https://github.com/rg3/youtube-dl/issues/7607) collection = self._parse_json( self._search_regex( r'var\s+initialState\s*=\s*({.+?});\n', webpage, 'initial state'), display_id)['course']['modules'] 
module, clip = None, None for module_ in collection: if name in (module_.get('moduleName'), module_.get('name')): module = module_ for clip_ in module_.get('clips', []): clip_index = clip_.get('clipIndex') if clip_index is None: clip_index = clip_.get('index') if clip_index is None: continue if compat_str(clip_index) == clip_id: clip = clip_ break if not clip: raise ExtractorError('Unable to resolve clip') QUALITIES = { 'low': {'width': 640, 'height': 480}, 'medium': {'width': 848, 'height': 640}, 'high': {'width': 1024, 'height': 768}, 'high-widescreen': {'width': 1280, 'height': 720}, } QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) quality_key = qualities(QUALITIES_PREFERENCE) AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) ALLOWED_QUALITIES = ( AllowedQuality('webm', ['high', ]), AllowedQuality('mp4', ['low', 'medium', 'high', ]), ) # Some courses also offer widescreen resolution for high quality (see # https://github.com/rg3/youtube-dl/issues/7766) widescreen = True if re.search( r'courseSupportsWidescreenVideoFormats\s*:\s*true', webpage) else False best_quality = 'high-widescreen' if widescreen else 'high' if widescreen: for allowed_quality in ALLOWED_QUALITIES: allowed_quality.qualities.append(best_quality) # In order to minimize the number of calls to ViewClip API and reduce # the probability of being throttled or banned by Pluralsight we will request # only single format until formats listing was explicitly requested. if self._downloader.params.get('listformats', False): allowed_qualities = ALLOWED_QUALITIES else: def guess_allowed_qualities(): req_format = self._downloader.params.get('format') or 'best' req_format_split = req_format.split('-', 1) if len(req_format_split) > 1: req_ext, req_quality = req_format_split for allowed_quality in ALLOWED_QUALITIES: if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: return (AllowedQuality(req_ext, (req_quality, )), ) req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' return (AllowedQuality(req_ext, (best_quality, )), ) allowed_qualities = guess_allowed_qualities() formats = [] for ext, qualities_ in allowed_qualities: for quality in qualities_: f = QUALITIES[quality].copy() clip_post = { 'a': author, 'cap': 'false', 'cn': clip_id, 'course': course, 'lc': 'en', 'm': name, 'mt': ext, 'q': '%dx%d' % (f['width'], f['height']), } request = sanitized_Request( '%s/training/Player/ViewClip' % self._API_BASE, json.dumps(clip_post).encode('utf-8')) request.add_header('Content-Type', 'application/json;charset=utf-8') format_id = '%s-%s' % (ext, quality) clip_url = self._download_webpage( request, display_id, 'Downloading %s URL' % format_id, fatal=False) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see # https://github.com/rg3/youtube-dl/pull/6989). Moreover it may even lead # to account ban (see https://github.com/rg3/youtube-dl/issues/6842). # To somewhat reduce the probability of these consequences # we will sleep random amount of time before each call to ViewClip. 
                self._sleep(
                    random.randint(2, 5), display_id,
                    '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling')

                if not clip_url:
                    continue
                f.update({
                    'url': clip_url,
                    'ext': ext,
                    'format_id': format_id,
                    'quality': quality_key(quality),
                })
                formats.append(f)
        self._sort_formats(formats)

        # TODO: captions
        # http://www.pluralsight.com/training/Player/ViewClip + cap = true
        # or
        # http://www.pluralsight.com/training/Player/Captions
        # { a = author, cn = clip_id, lc = end, m = name }

        return {
            'id': clip.get('clipName') or clip['name'],
            'title': '%s - %s' % (module['title'], clip['title']),
            'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')),
            'creator': author,
            'formats': formats
        }


class PluralsightCourseIE(PluralsightBaseIE):
    IE_NAME = 'pluralsight:course'
    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)'
    _TESTS = [{
        # Free course from Pluralsight Starter Subscription for Microsoft TechNet
        # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz
        'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas',
        'info_dict': {
            'id': 'hosting-sql-server-windows-azure-iaas',
            'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals',
            'description': 'md5:61b37e60f21c4b2f91dc621a977d0986',
        },
        'playlist_count': 31,
    }, {
        # available without pluralsight account
        'url': 'https://www.pluralsight.com/courses/angularjs-get-started',
        'only_matching': True,
    }, {
        'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        course_id = self._match_id(url)

        # TODO: PSM cookie

        course = self._download_json(
            '%s/data/course/%s' % (self._API_BASE, course_id),
            course_id, 'Downloading course JSON')

        title = course['title']
        description = course.get('description') or course.get('shortDescription')

        course_data = self._download_json(
            '%s/data/course/content/%s' % (self._API_BASE, course_id),
            course_id, 'Downloading course data JSON')

        entries = []
        for module in course_data:
            for clip in module.get('clips', []):
                player_parameters = clip.get('playerParameters')
                if not player_parameters:
                    continue
                entries.append(self.url_result(
                    '%s/training/player?%s' % (self._API_BASE, player_parameters),
                    'Pluralsight'))

        return self.playlist_result(entries, course_id, title, description)
youtube-dl/youtube_dl/extractor/vidme.py
from __future__ import unicode_literals

import itertools

from .common import InfoExtractor
from ..compat import
compat_HTTPError from ..utils import ( ExtractorError, int_or_none, float_or_none, parse_iso8601, ) class VidmeIE(InfoExtractor): IE_NAME = 'vidme' _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{,5})(?:[^\da-zA-Z]|$)' _TESTS = [{ 'url': 'https://vid.me/QNB', 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', 'info_dict': { 'id': 'QNB', 'ext': 'mp4', 'title': 'Fishing for piranha - the easy way', 'description': 'source: https://www.facebook.com/photo.php?v=312276045600871', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1406313244, 'upload_date': '20140725', 'age_limit': 0, 'duration': 119.92, 'view_count': int, 'like_count': int, 'comment_count': int, }, }, { 'url': 'https://vid.me/Gc6M', 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', 'info_dict': { 'id': 'Gc6M', 'ext': 'mp4', 'title': 'O Mere Dil ke chain - Arnav and Khushi VM', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1441211642, 'upload_date': '20150902', 'uploader': 'SunshineM', 'uploader_id': '3552827', 'age_limit': 0, 'duration': 223.72, 'view_count': int, 'like_count': int, 'comment_count': int, }, 'params': { 'skip_download': True, }, }, { # tests uploader field 'url': 'https://vid.me/4Iib', 'info_dict': { 'id': '4Iib', 'ext': 'mp4', 'title': 'The Carver', 'description': 'md5:e9c24870018ae8113be936645b93ba3c', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1433203629, 'upload_date': '20150602', 'uploader': 'Thomas', 'uploader_id': '109747', 'age_limit': 0, 'duration': 97.859999999999999, 'view_count': int, 'like_count': int, 'comment_count': int, }, 'params': { 'skip_download': True, }, }, { # nsfw test from http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching 'url': 'https://vid.me/e/Wmur', 'info_dict': { 'id': 'Wmur', 'ext': 'mp4', 'title': 'naked smoking & stretching', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1430931613, 'upload_date': '20150506', 'uploader': 'naked-yogi', 'uploader_id': '1638622', 'age_limit': 18, 'duration': 653.26999999999998, 'view_count': int, 'like_count': int, 'comment_count': int, }, 'params': { 'skip_download': True, }, }, { # nsfw, user-disabled 'url': 'https://vid.me/dzGJ', 'only_matching': True, }, { # suspended 'url': 'https://vid.me/Ox3G', 'only_matching': True, }, { # deleted 'url': 'https://vid.me/KTPm', 'only_matching': True, }, { # no formats in the API response 'url': 'https://vid.me/e5g', 'info_dict': { 'id': 'e5g', 'ext': 'mp4', 'title': 'Video upload (e5g)', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1401480195, 'upload_date': '20140530', 'uploader': None, 'uploader_id': None, 'age_limit': 0, 'duration': 483, 'view_count': int, 'like_count': int, 'comment_count': int, }, 'params': { 'skip_download': True, }, }] def _real_extract(self, url): video_id = self._match_id(url) try: response = self._download_json( 'https://api.vid.me/videoByUrl/%s' % video_id, video_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: response = self._parse_json(e.cause.read(), video_id) else: raise error = response.get('error') if error: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error), expected=True) video = response['video'] if video.get('state') == 'deleted': raise ExtractorError( 'Vidme said: Sorry, this video has been deleted.', expected=True) if video.get('state') in ('user-disabled', 'suspended'): raise ExtractorError( 'Vidme said: This video has been suspended either due to a copyright claim, ' 'or for violating the terms of use.', expected=True) formats = [{ 'format_id': f.get('type'), 'url': 
f['uri'],
            'width': int_or_none(f.get('width')),
            'height': int_or_none(f.get('height')),
            'preference': 0 if f.get('type', '').endswith('clip') else 1,
        } for f in video.get('formats', []) if f.get('uri')]

        if not formats and video.get('complete_url'):
            formats.append({
                'url': video.get('complete_url'),
                'width': int_or_none(video.get('width')),
                'height': int_or_none(video.get('height')),
            })

        self._sort_formats(formats)

        title = video['title']
        description = video.get('description')
        thumbnail = video.get('thumbnail_url')
        timestamp = parse_iso8601(video.get('date_created'), ' ')
        uploader = video.get('user', {}).get('username')
        uploader_id = video.get('user', {}).get('user_id')
        age_limit = 18 if video.get('nsfw') is True else 0
        duration = float_or_none(video.get('duration'))
        view_count = int_or_none(video.get('view_count'))
        like_count = int_or_none(video.get('likes_count'))
        comment_count = int_or_none(video.get('comment_count'))

        return {
            'id': video_id,
            'title': title or 'Video upload (%s)' % video_id,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'age_limit': age_limit,
            'timestamp': timestamp,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'comment_count': comment_count,
            'formats': formats,
        }


class VidmeListBaseIE(InfoExtractor):
    # Max possible limit according to https://docs.vid.me/#api-Videos-List
    _LIMIT = 100

    def _entries(self, user_id, user_name):
        for page_num in itertools.count(1):
            page = self._download_json(
                'https://api.vid.me/videos/%s?user=%s&limit=%d&offset=%d'
                % (self._API_ITEM, user_id, self._LIMIT, (page_num - 1) * self._LIMIT),
                user_name, 'Downloading user %s page %d' % (self._API_ITEM, page_num))

            videos = page.get('videos', [])
            if not videos:
                break

            for video in videos:
                video_url = video.get('full_url') or video.get('embed_url')
                if video_url:
                    yield self.url_result(video_url, VidmeIE.ie_key())

            total = int_or_none(page.get('page', {}).get('total'))
            if total and self._LIMIT * page_num >= total:
                break

    def _real_extract(self, url):
        user_name = self._match_id(url)

        user_id = self._download_json(
            'https://api.vid.me/userByUsername?username=%s' % user_name,
            user_name)['user']['user_id']

        return self.playlist_result(
            self._entries(user_id, user_name), user_id,
            '%s - %s' % (user_name, self._TITLE))


class VidmeUserIE(VidmeListBaseIE):
    IE_NAME = 'vidme:user'
    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)'
    _API_ITEM = 'list'
    _TITLE = 'Videos'
    _TEST = {
        'url': 'https://vid.me/EFARCHIVE',
        'info_dict': {
            'id': '3834632',
            'title': 'EFARCHIVE - %s' % _TITLE,
        },
        'playlist_mincount': 238,
    }


class VidmeUserLikesIE(VidmeListBaseIE):
    IE_NAME = 'vidme:user:likes'
    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes'
    _API_ITEM = 'likes'
    _TITLE = 'Likes'
    _TEST = {
        'url': 'https://vid.me/ErinAlexis/likes',
        'info_dict': {
            'id': '6483530',
            'title': 'ErinAlexis - %s' % _TITLE,
        },
        'playlist_mincount': 415,
    }
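# A minimal standalone sketch (not part of youtube-dl) of the offset-based
# pagination loop VidmeListBaseIE._entries implements above: keep requesting
# fixed-size pages until one comes back empty or short. `get_page` is an
# assumed callable standing in for the vid.me API request.
import itertools


def paged_entries(get_page, limit=100):
    for page_num in itertools.count(1):
        items = get_page(offset=(page_num - 1) * limit, limit=limit)
        if not items:
            break
        for item in items:
            yield item
        # A short page means the source is exhausted; stop without issuing
        # one more request that would come back empty.
        if len(items) < limit:
            break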
youtube-dl/youtube_dl/extractor/sport5.py
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError


class Sport5IE(InfoExtractor):
    _VALID_URL = r'http://(?:(?:www|vod)\.)?sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1',
            'info_dict': {
                'id': 's5-Y59xx1-GUh2',
                'ext': 'mp4',
                'title': 'ולנסיה-קורדובה 0:3',
                'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה',
                'duration': 228,
                'categories': list,
            },
            'skip': 'Blocked outside of Israel',
        }, {
            'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE',
            'info_dict': {
                'id': 's5-SiXxx1-hKh2',
                'ext': 'mp4',
                'title': 'GOALS_CELTIC_270914.mp4',
                'description': '',
                'duration': 87,
                'categories': list,
            },
            'skip': 'Blocked outside of Israel',
        }
    ]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        media_id = mobj.group('id')

        webpage = self._download_webpage(url, media_id)

        video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id')

        metadata = self._download_xml(
            'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id,
            video_id)

        error = metadata.find('./Error')
        if error is not None:
            raise ExtractorError(
                '%s returned error: %s - %s' % (
                    self.IE_NAME,
                    error.find('./Name').text,
                    error.find('./Description').text),
                expected=True)

        title = metadata.find('./Title').text
        description = metadata.find('./Description').text
        duration = int(metadata.find('./Duration').text)

        posters_el = metadata.find('./PosterLinks')
        thumbnails = [{
            'url': thumbnail.text,
            'width': int(thumbnail.get('width')),
            'height': int(thumbnail.get('height')),
        } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else []

        categories_el = metadata.find('./Categories')
        categories = [
            cat.get('name') for cat in categories_el.findall('./Category')
        ] if categories_el is not None else []

        formats = [{
            'url': fmt.text,
            'ext': 'mp4',
            'vbr': int(fmt.get('bitrate')),
            'width': int(fmt.get('width')),
            'height': int(fmt.get('height')),
        } for fmt in metadata.findall('./PlaybackLinks/FileURL')]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'duration': duration,
            'categories': categories,
            'formats': formats,
        }
youtube-dl/youtube_dl/extractor/sexykarma.py
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
unified_strdate, parse_duration, int_or_none, ) class SexyKarmaIE(InfoExtractor): IE_DESC = 'Sexy Karma and Watch Indian Porn' _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' _TESTS = [{ 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', 'info_dict': { 'id': 'yHI70cOyIHt', 'display_id': 'taking-a-quick-pee', 'ext': 'mp4', 'title': 'Taking a quick pee.', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'wildginger7', 'upload_date': '20141008', 'duration': 22, 'view_count': int, 'comment_count': int, 'categories': list, 'age_limit': 18, } }, { 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', 'md5': 'dd216c68d29b49b12842b9babe762a5d', 'info_dict': { 'id': '8Id6EZPbuHf', 'display_id': 'pot-pixie-tribute', 'ext': 'mp4', 'title': 'pot_pixie tribute', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'banffite', 'upload_date': '20141013', 'duration': 16, 'view_count': int, 'comment_count': int, 'categories': list, 'age_limit': 18, } }, { 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', 'md5': '9afb80675550406ed9a63ac2819ef69d', 'info_dict': { 'id': 'dW2mtctxJfs', 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', 'ext': 'mp4', 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Don', 'upload_date': '20140213', 'duration': 83, 'view_count': int, 'comment_count': int, 'categories': list, 'age_limit': 18, } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) video_url = self._html_search_regex( r"url: escape\('([^']+)'\)", webpage, 'url') title = self._html_search_regex( r'<h2 class="he2"><span>(.*?)</span>', webpage, 'title') thumbnail = self._html_search_regex( r'<span id="container"><img\s+src="([^"]+)"', webpage, 'thumbnail', fatal=False) uploader = self._html_search_regex( r'class="aupa">\s*(.*?)</a>', webpage, 'uploader') upload_date = unified_strdate(self._html_search_regex( r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) duration = parse_duration(self._search_regex( r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', webpage, 'duration', fatal=False)) view_count = int_or_none(self._search_regex( r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', webpage, 'view count', fatal=False)) comment_count = int_or_none(self._search_regex( r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', webpage, 'comment count', fatal=False)) categories = re.findall( r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', webpage) return { 'id': video_id, 'display_id': display_id, 'url': video_url, 'title': title, 'thumbnail': thumbnail, 'uploader': uploader, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'comment_count': comment_count, 'categories': categories, 'age_limit': 18, } 
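# The extractor above normalizes scraped strings through youtube_dl.utils
# helpers; a quick self-contained illustration of their behaviour (sample
# inputs are made up, expected values shown as comments):
from youtube_dl.utils import int_or_none, parse_duration, unified_strdate

print(parse_duration('1:23'))                 # -> 83 seconds
print(unified_strdate('October 8, 2014'))     # -> '20141008'
print(int_or_none('42'), int_or_none(None))   # -> 42 None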
youtube-dl/youtube_dl/extractor/karrierevideos.py
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    fix_xml_ampersands,
    float_or_none,
    xpath_with_ns,
    xpath_text,
)


class KarriereVideosIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
    _TESTS = [{
        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
        'info_dict': {
            'id': '32c91',
            'ext': 'flv',
            'title': 'AltenpflegerIn',
            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
            'thumbnail': 're:^http://.*\.png',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }, {
        # broken ampersands
        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
        'info_dict': {
            'id': '5sniu',
            'ext': 'flv',
            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
            'thumbnail': 're:^http://.*\.png',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        title = (self._html_search_meta('title', webpage, default=None) or
                 self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title'))

        video_id = self._search_regex(
            r'/config/video/(.+?)\.xml', webpage, 'video id')
        playlist = self._download_xml(
            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
            video_id, transform_source=fix_xml_ampersands)

        NS_MAP = {
            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
        }

        def ns(path):
            return xpath_with_ns(path, NS_MAP)

        item = playlist.find('./tracklist/item')
        video_file = xpath_text(
            item, ns('./jwplayer:file'), 'video url', fatal=True)
        streamer = xpath_text(
            item, ns('./jwplayer:streamer'), 'streamer', fatal=True)

        uploader = xpath_text(
            item, ns('./jwplayer:author'), 'uploader')
        duration = float_or_none(
            xpath_text(item, ns('./jwplayer:duration'), 'duration'))

        description = self._html_search_regex(
            r'(?s)<div class="leadtext">(.+?)</div>',
            webpage, 'description')

        thumbnail = self._html_search_meta(
            'thumbnail', webpage, 'thumbnail')
        if thumbnail:
            thumbnail = compat_urlparse.urljoin(url, thumbnail)

        return {
            'id': video_id,
            'url': streamer.replace('rtmpt', 'rtmp'),
            'play_path': 'mp4:%s' % video_file,
            'ext': 'flv',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'duration': duration,
        }
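# KarriereVideosIE above resolves jwplayer-namespaced playlist fields via
# xpath_with_ns; the same lookup in plain ElementTree terms looks roughly
# like this (the XML document below is a made-up stand-in for the real
# playlist, not actual site data):
import xml.etree.ElementTree as ET

JWPLAYER_NS = {'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'}
playlist = ET.fromstring(
    '<tracklist xmlns:jwplayer="http://developer.longtailvideo.com/trac/wiki/FlashFormats">'
    '<item><jwplayer:file>video.mp4</jwplayer:file></item>'
    '</tracklist>')
item = playlist.find('./item')
# Passing the prefix->URI map lets the short 'jwplayer:' prefix resolve.
print(item.find('./jwplayer:file', JWPLAYER_NS).text)  # -> video.mp4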
youtube-dl/youtube_dl/extractor/mtv.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
    compat_str,
)
from ..utils import (
    ExtractorError,
    find_xpath_attr,
    fix_xml_ampersands,
    float_or_none,
    HEADRequest,
    sanitized_Request,
    unescapeHTML,
    url_basename,
    RegexNotFoundError,
)


def _media_xml_tag(tag):
    return '{http://search.yahoo.com/mrss/}%s' % tag


class MTVServicesInfoExtractor(InfoExtractor):
    _MOBILE_TEMPLATE = None
    _LANG = None

    @staticmethod
    def _id_from_uri(uri):
        return uri.split(':')[-1]

    # This was originally implemented for ComedyCentral, but it also works here
    @staticmethod
    def _transform_rtmp_url(rtmp_video_url):
        m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
        if not m:
            return rtmp_video_url
        base = 'http://viacommtvstrmfs.fplive.net/'
        return base + m.group('finalid')

    def _get_feed_url(self, uri):
        return self._FEED_URL

    def _get_thumbnail_url(self, uri, itemdoc):
        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
        thumb_node = itemdoc.find(search_path)
        if thumb_node is None:
            return None
        else:
            return thumb_node.attrib['url']

    def _extract_mobile_video_formats(self, mtvn_id):
        webpage_url = self._MOBILE_TEMPLATE % mtvn_id
        req = sanitized_Request(webpage_url)
        # Otherwise we get a webpage that would execute some javascript
        req.add_header('User-Agent', 'curl/7')
        webpage = self._download_webpage(req, mtvn_id,
                                         'Downloading mobile page')
        metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
        req = HEADRequest(metrics_url)
        response = self._request_webpage(req, mtvn_id, 'Resolving url')
        url = response.geturl()
        # Transform the url to get the best quality:
        url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
        return [{'url': url, 'ext': 'mp4'}]

    def _extract_video_formats(self, mdoc, mtvn_id):
        if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None:
            if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
                self.to_screen('The normal version is not available from your '
                               'country, trying with the mobile version')
                return self._extract_mobile_video_formats(mtvn_id)
            raise ExtractorError('This video is not available from your country.',
                                 expected=True)

        formats = []
        for rendition in mdoc.findall('.//rendition'):
            try:
                _, _, ext = rendition.attrib['type'].partition('/')
                rtmp_video_url = rendition.find('./src').text
                if rtmp_video_url.endswith('siteunavail.png'):
                    continue
                formats.append({
                    'ext': ext,
                    'url': self._transform_rtmp_url(rtmp_video_url),
                    'format_id':
rendition.get('bitrate'), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), }) except (KeyError, TypeError): raise ExtractorError('Invalid rendition field.') self._sort_formats(formats) return formats def _extract_subtitles(self, mdoc, mtvn_id): subtitles = {} for transcript in mdoc.findall('.//transcript'): if transcript.get('kind') != 'captions': continue lang = transcript.get('srclang') subtitles[lang] = [{ 'url': compat_str(typographic.get('src')), 'ext': typographic.get('format') } for typographic in transcript.findall('./typographic')] return subtitles def _get_video_info(self, itemdoc): uri = itemdoc.find('guid').text video_id = self._id_from_uri(uri) self.report_extraction(video_id) content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) mediagen_url = content_el.attrib['url'] # Remove the templates, like &device={device} mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: mediagen_url += '&' if '?' in mediagen_url else '?' mediagen_url += 'acceptMethods=fms' mediagen_doc = self._download_xml(mediagen_url, video_id, 'Downloading video urls') item = mediagen_doc.find('./video/item') if item is not None and item.get('type') == 'text': message = '%s returned error: ' % self.IE_NAME if item.get('code') is not None: message += '%s - ' % item.get('code') message += item.text raise ExtractorError(message, expected=True) description_node = itemdoc.find('description') if description_node is not None: description = description_node.text.strip() else: description = None title_el = None if title_el is None: title_el = find_xpath_attr( itemdoc, './/{http://search.yahoo.com/mrss/}category', 'scheme', 'urn:mtvn:video_title') if title_el is None: title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title') if title_el is None: title_el = itemdoc.find('.//title') or itemdoc.find('./title') if title_el.text is None: title_el = None title = title_el.text if title is None: raise ExtractorError('Could not find video title') title = title.strip() # This a short id that's used in the webpage urls mtvn_id = None mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category', 'scheme', 'urn:mtvn:id') if mtvn_id_node is not None: mtvn_id = mtvn_id_node.text return { 'title': title, 'formats': self._extract_video_formats(mediagen_doc, mtvn_id), 'subtitles': self._extract_subtitles(mediagen_doc, mtvn_id), 'id': video_id, 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, 'duration': float_or_none(content_el.attrib.get('duration')), } def _get_feed_query(self, uri): data = {'uri': uri} if self._LANG: data['lang'] = self._LANG return compat_urllib_parse.urlencode(data) def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) feed_url = self._get_feed_url(uri) info_url = feed_url + '?' 
+ self._get_feed_query(uri) return self._get_videos_info_from_url(info_url, video_id) def _get_videos_info_from_url(self, url, video_id): idoc = self._download_xml( url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) return self.playlist_result( [self._get_video_info(item) for item in idoc.findall('.//item')]) def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} og_url = self._og_search_video_url(webpage) mgid = url_basename(og_url) if mgid.endswith('.swf'): mgid = mgid[:-4] except RegexNotFoundError: mgid = None if mgid is None or ':' not in mgid: mgid = self._search_regex( [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'], webpage, 'mgid', default=None) if not mgid: sm4_embed = self._html_search_meta( 'sm4:video:embed', webpage, 'sm4 embed', default='') mgid = self._search_regex( r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid') return mgid def _real_extract(self, url): title = url_basename(url) webpage = self._download_webpage(url, title) mgid = self._extract_mgid(webpage) videos_info = self._get_videos_info(mgid) return videos_info class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): IE_NAME = 'mtvservices:embedded' _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)' _TEST = { # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/ 'url': 'http://media.mtvnservices.com/embed/mgid:uma:video:mtv.com:1043906/cp~vid%3D1043906%26uri%3Dmgid%3Auma%3Avideo%3Amtv.com%3A1043906', 'md5': 'cb349b21a7897164cede95bd7bf3fbb9', 'info_dict': { 'id': '1043906', 'ext': 'mp4', 'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds', 'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.', }, } @staticmethod def _extract_url(webpage): mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage) if mobj: return mobj.group('url') def _get_feed_url(self, uri): video_id = self._id_from_uri(uri) site_id = uri.replace(video_id, '') config_url = ('http://media.mtvnservices.com/pmt/e1/players/{0}/' 'context4/context5/config.xml'.format(site_id)) config_doc = self._download_xml(config_url, video_id) feed_node = config_doc.find('.//feed') feed_url = feed_node.text.strip().split('?')[0] return feed_url def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) mgid = mobj.group('mgid') return self._get_videos_info(mgid) class MTVIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)^https?:// (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$| m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))''' _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' _TESTS = [ { 'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', 'md5': '850f3f143316b1e71fa56a4edfd6e0f8', 'info_dict': { 'id': '853555', 'ext': 'mp4', 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', }, }, ] def _get_thumbnail_url(self, uri, itemdoc): return 'http://mtv.mtvnimages.com/uri/' + uri def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') uri = mobj.groupdict().get('mgid') if uri is None: webpage = self._download_webpage(url, video_id) # Some videos come from Vevo.com m_vevo = re.search( r'(?s)isVevoVideo = true;.*?vevoVideoId 
= "(.*?)";', webpage) if m_vevo: vevo_id = m_vevo.group(1) self.to_screen('Vevo video detected: %s' % vevo_id) return self.url_result('vevo:%s' % vevo_id, ie='Vevo') uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') return self._get_videos_info(uri) class MTVIggyIE(MTVServicesInfoExtractor): IE_NAME = 'mtviggy.com' _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+' _TEST = { 'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/', 'info_dict': { 'id': '984696', 'ext': 'mp4', 'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet', } } _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' class MTVDEIE(MTVServicesInfoExtractor): IE_NAME = 'mtv.de' _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$' _TESTS = [{ 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', 'info_dict': { 'id': 'music_video-a50bc5f0b3aa4b3190aa', 'ext': 'mp4', 'title': 'MusicVideo_cro-traum', 'description': 'Cro - Traum', }, 'params': { # rtmp download 'skip_download': True, }, }, { # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', 'info_dict': { 'id': 'local_playlist-f5ae778b9832cc837189', 'ext': 'mp4', 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', }, 'params': { # rtmp download 'skip_download': True, }, }, { # single video in pagePlaylist with different id 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', 'info_dict': { 'id': 'local_playlist-4e760566473c4c8c5344', 'ext': 'mp4', 'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1', 'description': 'MTV Movies Supercut', }, 'params': { # rtmp download 'skip_download': True, }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) playlist = self._parse_json( self._search_regex( r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), video_id) # news pages contain single video in playlist with different id if len(playlist) == 1: return self._get_videos_info_from_url(playlist[0]['mrss'], video_id) for item in playlist: item_id = item.get('id') if item_id and compat_str(item_id) == video_id: return self._get_videos_info_from_url(item['mrss'], video_id) ��������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/kuwo.py�������������������������������������������������������������0000644�0000000�0000000�00000026531�12654643170�017411� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import re import itertools from .common import InfoExtractor from ..utils import ( get_element_by_id, clean_html, ExtractorError, remove_start, ) class KuwoBaseIE(InfoExtractor): _FORMATS = [ {'format': 'ape', 'ext': 'ape', 'preference': 100}, {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80}, {'format': 'mp3-192', 'ext': 'mp3', 'br': 
'192kmp3', 'abr': 192, 'preference': 70}, {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60}, {'format': 'wma', 'ext': 'wma', 'preference': 20}, {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} ] def _get_formats(self, song_id): formats = [] for file_format in self._FORMATS: song_url = self._download_webpage( 'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' % (file_format['ext'], file_format.get('br', ''), song_id), song_id, note='Download %s url info' % file_format['format'], ) if song_url == 'IPDeny': raise ExtractorError('This song is blocked in this region', expected=True) if song_url.startswith('http://') or song_url.startswith('https://'): formats.append({ 'url': song_url, 'format_id': file_format['format'], 'format': file_format['format'], 'preference': file_format['preference'], 'abr': file_format.get('abr'), }) self._sort_formats(formats) return formats class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { 'id': '635632', 'ext': 'ape', 'title': '爱我别走', 'creator': '张震岳', 'upload_date': '20080122', 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' }, 'skip': 'this song has been offline because of copyright issues', }, { 'url': 'http://www.kuwo.cn/yinyue/6446136/', 'info_dict': { 'id': '6446136', 'ext': 'mp3', 'title': '心', 'creator': 'IU', 'upload_date': '20150518', }, 'params': { 'format': 'mp3-320' }, }] def _real_extract(self, url): song_id = self._match_id(url) webpage = self._download_webpage( url, song_id, note='Download song detail info', errnote='Unable to get song detail info') if '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: raise ExtractorError('this song has been offline because of copyright issues', expected=True) song_name = self._html_search_regex( r'(?s)class="(?:[^"\s]+\s+)*title(?:\s+[^"\s]+)*".*?<h1[^>]+title="([^"]+)"', webpage, 'song name') singer_name = self._html_search_regex( r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"', webpage, 'singer name', fatal=False) lrc_content = clean_html(get_element_by_id('lrcContent', webpage)) if lrc_content == '暂无': # indicates no lyrics lrc_content = None formats = self._get_formats(song_id) album_id = self._html_search_regex( r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"', webpage, 'album id', fatal=False) publish_time = None if album_id is not None: album_info_page = self._download_webpage( 'http://www.kuwo.cn/album/%s/' % album_id, song_id, note='Download album detail info', errnote='Unable to get album detail info') publish_time = self._html_search_regex( r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page, 'publish time', fatal=False) if publish_time: publish_time = publish_time.replace('-', '') return { 'id': song_id, 'title': song_name, 'creator': singer_name, 'upload_date': publish_time, 'description': lrc_content, 'formats': formats, } class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { 'id': '502294', 'title': 'M', 'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c', }, 'playlist_count': 2, } def _real_extract(self, url): album_id = self._match_id(url) webpage = self._download_webpage( url, album_id, note='Download album info', errnote='Unable to get album info') album_name = self._html_search_regex( 
r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage, 'album name') album_intro = remove_start( clean_html(get_element_by_id('intro', webpage)), '%s简介:' % album_name) entries = [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', webpage) ] return self.playlist_result(entries, album_id, album_name, album_intro) class KuwoChartIE(InfoExtractor): IE_NAME = 'kuwo:chart' IE_DESC = '酷我音乐 - 排行榜' _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 'info_dict': { 'id': '香港中文龙虎榜', 'title': '香港中文龙虎榜', 'description': 're:\d{4}第\d{2}期', }, 'playlist_mincount': 10, } def _real_extract(self, url): chart_id = self._match_id(url) webpage = self._download_webpage( url, chart_id, note='Download chart info', errnote='Unable to get chart info') chart_name = self._html_search_regex( r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name') chart_desc = self._html_search_regex( r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc') entries = [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage) ] return self.playlist_result(entries, chart_id, chart_name, chart_desc) class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { 'id': 'bruno+mars', 'title': 'Bruno Mars', }, 'playlist_count': 10, }, { 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', 'info_dict': { 'id': 'Ali', 'title': 'Ali', }, 'playlist_mincount': 95, 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 }] def _real_extract(self, url): singer_id = self._match_id(url) webpage = self._download_webpage( url, singer_id, note='Download singer info', errnote='Unable to get singer info') singer_name = self._html_search_regex( r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name' ) entries = [] first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True for page_num in itertools.count(1): webpage = self._download_webpage( 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), singer_id, note='Download song list page #%d' % page_num, errnote='Unable to get song list page #%d' % page_num) entries.extend([ self.url_result(song_url, 'Kuwo') for song_url in re.findall( r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/', webpage) ][:10 if first_page_only else None]) if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', webpage): break return self.playlist_result(entries, singer_id, singer_name) class KuwoCategoryIE(InfoExtractor): IE_NAME = 'kuwo:category' IE_DESC = '酷我音乐 - 分类' _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 'info_dict': { 'id': '86375', 'title': '八十年代精选', 'description': '这些都是属于八十年代的回忆!', }, 'playlist_count': 30, } def _real_extract(self, url): category_id = self._match_id(url) webpage = self._download_webpage( url, category_id, note='Download category info', errnote='Unable to get category info') category_name = self._html_search_regex( r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name') category_desc = remove_start( get_element_by_id('intro', webpage).strip(), '%s简介:' % category_name) jsonm = 
self._parse_json(self._html_search_regex(
            r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)

        entries = [
            self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo')
            for song in jsonm['musiclist']
        ]
        return self.playlist_result(entries, category_id, category_name, category_desc)


class KuwoMvIE(KuwoBaseIE):
    IE_NAME = 'kuwo:mv'
    IE_DESC = '酷我音乐 - MV'
    _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/'
    _TEST = {
        'url': 'http://www.kuwo.cn/mv/6480076/',
        'info_dict': {
            'id': '6480076',
            'ext': 'mkv',
            'title': '我们家MV',
            'creator': '2PM',
        },
    }
    _FORMATS = KuwoBaseIE._FORMATS + [
        {'format': 'mkv', 'ext': 'mkv', 'preference': 250},
        {'format': 'mp4', 'ext': 'mp4', 'preference': 200},
    ]

    def _real_extract(self, url):
        song_id = self._match_id(url)
        webpage = self._download_webpage(
            url, song_id, note='Download mv detail info: %s' % song_id,
            errnote='Unable to get mv detail info: %s' % song_id)

        mobj = re.search(
            r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"',
            webpage)
        if mobj:
            song_name = mobj.group('song')
            singer_name = mobj.group('singer')
        else:
            raise ExtractorError('Unable to find song or singer names')

        formats = self._get_formats(song_id)

        return {
            'id': song_id,
            'title': song_name,
            'creator': singer_name,
            'formats': formats,
        }
youtube-dl/youtube_dl/extractor/cspan.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    unescapeHTML,
    find_xpath_attr,
    smuggle_url,
    determine_ext,
    ExtractorError,
)
from .senateisvp import SenateISVPIE


class CSpanIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
    IE_DESC = 'C-SPAN'
    _TESTS = [{
        'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
        'md5': '94b29a4f131ff03d23471dd6f60b6a1d',
        'info_dict': {
            'id': '315139',
            'ext': 'mp4',
            'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
            'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v.
Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', 'info_dict': { 'id': 'c4486943', 'ext': 'mp4', 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov 'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers', 'info_dict': { 'id': 'judiciary031715', 'ext': 'flv', 'title': 'Immigration Reforms Needed to Protect Skilled American Workers', } }] def _real_extract(self, url): video_id = self._match_id(url) video_type = None webpage = self._download_webpage(url, video_id) # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) if results: matches = results[0] video_type, video_id = matches.groups() video_type = 'clip' if video_type == 'id' else 'program' else: m = re.search(r'data-(?P<type>clip|prog)id=["\'](?P<id>\d+)', webpage) if m: video_id = m.group('id') video_type = 'program' if m.group('type') == 'prog' else 'clip' else: senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) if senate_isvp_url: title = self._og_search_title(webpage) surl = smuggle_url(senate_isvp_url, {'force_title': title}) return self.url_result(surl, 'SenateISVP', video_id, title) if video_type is None or video_id is None: raise ExtractorError('unable to find video id and type') def get_text_attr(d, attr): return d.get(attr, {}).get('#text') data = self._download_json( 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), video_id)['video'] if data['@status'] != 'Success': raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) description = self._html_search_meta('description', webpage) title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text files = data['files'] capfile = get_text_attr(data, 'capfile') entries = [] for partnum, f in enumerate(files): formats = [] for quality in f['qualities']: formats.append({ 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), 'url': unescapeHTML(get_text_attr(quality, 'file')), 'height': int_or_none(get_text_attr(quality, 'height')), 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), }) if not formats: path = unescapeHTML(get_text_attr(f, 'path')) if not path: continue formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), 'title': ( title if len(files) == 1 else '%s part %d' % (title, partnum + 1)), 'formats': 
formats, 'description': description, 'thumbnail': thumbnail, 'duration': int_or_none(get_text_attr(f, 'length')), 'subtitles': { 'en': [{ 'url': capfile, 'ext': determine_ext(capfile, 'dfxp') }], } if capfile else None, }) if len(entries) == 1: entry = dict(entries[0]) entry['id'] = 'c' + video_id if video_type == 'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, 'id': 'c' + video_id if video_type == 'clip' else video_id, }
youtube-dl/youtube_dl/extractor/globo.py
# coding: utf-8 from __future__ import unicode_literals import random import math from .common import InfoExtractor from ..compat import ( compat_str, compat_chr, compat_ord, ) from ..utils import ( ExtractorError, float_or_none, int_or_none, str_or_none, ) class GloboIE(InfoExtractor): _VALID_URL = '(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' _RESIGN_EXPIRATION = 86400 _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', 'info_dict': { 'id': '3607726', 'ext': 'mp4', 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', 'duration': 103.204, 'uploader': 'Globo.com', 'uploader_id': '265', }, }, { 'url': 'http://globoplay.globo.com/v/4581987/', 'md5': 'f36a1ecd6a50da1577eee6dd17f67eff', 'info_dict': { 'id': '4581987', 'ext': 'mp4', 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', 'duration': 137.973, 'uploader': 'Rede Globo', 'uploader_id': '196', }, }, { 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', 'only_matching': True, }, { 'url': 'http://globosatplay.globo.com/globonews/v/4472924/', 'only_matching': True, }, { 'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/', 'only_matching': True, }, { 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', 'only_matching': True, }, { 'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html', 'only_matching': True, }] class MD5(object): HEX_FORMAT_LOWERCASE = 0 HEX_FORMAT_UPPERCASE = 1 BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' BASE64_PAD_CHARACTER_RFC_COMPLIANCE = '=' PADDING = '=0xFF01DD' hexcase = 0 b64pad = '' def __init__(self): pass class JSArray(list): def
__getitem__(self, y): try: return list.__getitem__(self, y) except IndexError: return 0 def __setitem__(self, i, y): try: return list.__setitem__(self, i, y) except IndexError: self.extend([0] * (i - len(self) + 1)) self[-1] = y @classmethod def hex_md5(cls, param1): return cls.rstr2hex(cls.rstr_md5(cls.str2rstr_utf8(param1))) @classmethod def b64_md5(cls, param1, param2=None): return cls.rstr2b64(cls.rstr_md5(cls.str2rstr_utf8(param1, param2))) @classmethod def any_md5(cls, param1, param2): return cls.rstr2any(cls.rstr_md5(cls.str2rstr_utf8(param1)), param2) @classmethod def rstr_md5(cls, param1): return cls.binl2rstr(cls.binl_md5(cls.rstr2binl(param1), len(param1) * 8)) @classmethod def rstr2hex(cls, param1): _loc_2 = '0123456789ABCDEF' if cls.hexcase else '0123456789abcdef' _loc_3 = '' for _loc_5 in range(0, len(param1)): _loc_4 = compat_ord(param1[_loc_5]) _loc_3 += _loc_2[_loc_4 >> 4 & 15] + _loc_2[_loc_4 & 15] return _loc_3 @classmethod def rstr2b64(cls, param1): _loc_2 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' _loc_3 = '' _loc_4 = len(param1) for _loc_5 in range(0, _loc_4, 3): _loc_6_1 = compat_ord(param1[_loc_5]) << 16 _loc_6_2 = compat_ord(param1[_loc_5 + 1]) << 8 if _loc_5 + 1 < _loc_4 else 0 _loc_6_3 = compat_ord(param1[_loc_5 + 2]) if _loc_5 + 2 < _loc_4 else 0 _loc_6 = _loc_6_1 | _loc_6_2 | _loc_6_3 for _loc_7 in range(0, 4): if _loc_5 * 8 + _loc_7 * 6 > len(param1) * 8: _loc_3 += cls.b64pad else: _loc_3 += _loc_2[_loc_6 >> 6 * (3 - _loc_7) & 63] return _loc_3 @staticmethod def rstr2any(param1, param2): _loc_3 = len(param2) _loc_4 = [] _loc_9 = [0] * ((len(param1) >> 2) + 1) for _loc_5 in range(0, len(_loc_9)): _loc_9[_loc_5] = compat_ord(param1[_loc_5 * 2]) << 8 | compat_ord(param1[_loc_5 * 2 + 1]) while len(_loc_9) > 0: _loc_8 = [] _loc_7 = 0 for _loc_5 in range(0, len(_loc_9)): _loc_7 = (_loc_7 << 16) + _loc_9[_loc_5] _loc_6 = math.floor(_loc_7 / _loc_3) _loc_7 -= _loc_6 * _loc_3 if len(_loc_8) > 0 or _loc_6 > 0: _loc_8[len(_loc_8)] = _loc_6 _loc_4[len(_loc_4)] = _loc_7 _loc_9 = _loc_8 _loc_10 = '' _loc_5 = len(_loc_4) - 1 while _loc_5 >= 0: _loc_10 += param2[_loc_4[_loc_5]] _loc_5 -= 1 return _loc_10 @classmethod def str2rstr_utf8(cls, param1, param2=None): _loc_3 = '' _loc_4 = -1 if not param2: param2 = cls.PADDING param1 = param1 + param2[1:9] while True: _loc_4 += 1 if _loc_4 >= len(param1): break _loc_5 = compat_ord(param1[_loc_4]) _loc_6 = compat_ord(param1[_loc_4 + 1]) if _loc_4 + 1 < len(param1) else 0 if 55296 <= _loc_5 <= 56319 and 56320 <= _loc_6 <= 57343: _loc_5 = 65536 + ((_loc_5 & 1023) << 10) + (_loc_6 & 1023) _loc_4 += 1 if _loc_5 <= 127: _loc_3 += compat_chr(_loc_5) continue if _loc_5 <= 2047: _loc_3 += compat_chr(192 | _loc_5 >> 6 & 31) + compat_chr(128 | _loc_5 & 63) continue if _loc_5 <= 65535: _loc_3 += compat_chr(224 | _loc_5 >> 12 & 15) + compat_chr(128 | _loc_5 >> 6 & 63) + compat_chr( 128 | _loc_5 & 63) continue if _loc_5 <= 2097151: _loc_3 += compat_chr(240 | _loc_5 >> 18 & 7) + compat_chr(128 | _loc_5 >> 12 & 63) + compat_chr( 128 | _loc_5 >> 6 & 63) + compat_chr(128 | _loc_5 & 63) return _loc_3 @staticmethod def rstr2binl(param1): _loc_2 = [0] * ((len(param1) >> 2) + 1) for _loc_3 in range(0, len(_loc_2)): _loc_2[_loc_3] = 0 for _loc_3 in range(0, len(param1) * 8, 8): _loc_2[_loc_3 >> 5] |= (compat_ord(param1[_loc_3 // 8]) & 255) << _loc_3 % 32 return _loc_2 @staticmethod def binl2rstr(param1): _loc_2 = '' for _loc_3 in range(0, len(param1) * 32, 8): _loc_2 += compat_chr(param1[_loc_3 >> 5] >> _loc_3 % 32 & 255) return 
_loc_2 @classmethod def binl_md5(cls, param1, param2): param1 = cls.JSArray(param1) param1[param2 >> 5] |= 128 << param2 % 32 param1[(param2 + 64 >> 9 << 4) + 14] = param2 _loc_3 = 1732584193 _loc_4 = -271733879 _loc_5 = -1732584194 _loc_6 = 271733878 for _loc_7 in range(0, len(param1), 16): _loc_8 = _loc_3 _loc_9 = _loc_4 _loc_10 = _loc_5 _loc_11 = _loc_6 _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 7, -680876936) _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 1], 12, -389564586) _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 17, 606105819) _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 3], 22, -1044525330) _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 7, -176418897) _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 5], 12, 1200080426) _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 17, -1473231341) _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 7], 22, -45705983) _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 7, 1770035416) _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 9], 12, -1958414417) _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 17, -42063) _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 11], 22, -1990404162) _loc_3 = cls.md5_ff(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 7, 1804603682) _loc_6 = cls.md5_ff(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 13], 12, -40341101) _loc_5 = cls.md5_ff(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 17, -1502002290) _loc_4 = cls.md5_ff(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 15], 22, 1236535329) _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 5, -165796510) _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 6], 9, -1069501632) _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 14, 643717713) _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 0], 20, -373897302) _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 5, -701558691) _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 10], 9, 38016083) _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 14, -660478335) _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 4], 20, -405537848) _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 5, 568446438) _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 14], 9, -1019803690) _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 14, -187363961) _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 8], 20, 1163531501) _loc_3 = cls.md5_gg(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 5, -1444681467) _loc_6 = cls.md5_gg(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 2], 9, -51403784) _loc_5 = cls.md5_gg(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 14, 1735328473) _loc_4 = cls.md5_gg(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 12], 20, -1926607734) _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 5], 4, -378558) _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 8], 11, -2022574463) _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 11], 16, 1839030562) _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 14], 23, -35309556) _loc_3 = cls.md5_hh(_loc_3, 
_loc_4, _loc_5, _loc_6, param1[_loc_7 + 1], 4, -1530992060) _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 4], 11, 1272893353) _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 7], 16, -155497632) _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 10], 23, -1094730640) _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 13], 4, 681279174) _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 0], 11, -358537222) _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 3], 16, -722521979) _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 6], 23, 76029189) _loc_3 = cls.md5_hh(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 9], 4, -640364487) _loc_6 = cls.md5_hh(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 12], 11, -421815835) _loc_5 = cls.md5_hh(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 15], 16, 530742520) _loc_4 = cls.md5_hh(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 2], 23, -995338651) _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 0], 6, -198630844) _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 7], 10, 1126891415) _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 14], 15, -1416354905) _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 5], 21, -57434055) _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 12], 6, 1700485571) _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 3], 10, -1894986606) _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 10], 15, -1051523) _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 1], 21, -2054922799) _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 8], 6, 1873313359) _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 15], 10, -30611744) _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 6], 15, -1560198380) _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 13], 21, 1309151649) _loc_3 = cls.md5_ii(_loc_3, _loc_4, _loc_5, _loc_6, param1[_loc_7 + 4], 6, -145523070) _loc_6 = cls.md5_ii(_loc_6, _loc_3, _loc_4, _loc_5, param1[_loc_7 + 11], 10, -1120210379) _loc_5 = cls.md5_ii(_loc_5, _loc_6, _loc_3, _loc_4, param1[_loc_7 + 2], 15, 718787259) _loc_4 = cls.md5_ii(_loc_4, _loc_5, _loc_6, _loc_3, param1[_loc_7 + 9], 21, -343485551) _loc_3 = cls.safe_add(_loc_3, _loc_8) _loc_4 = cls.safe_add(_loc_4, _loc_9) _loc_5 = cls.safe_add(_loc_5, _loc_10) _loc_6 = cls.safe_add(_loc_6, _loc_11) return [_loc_3, _loc_4, _loc_5, _loc_6] @classmethod def md5_cmn(cls, param1, param2, param3, param4, param5, param6): return cls.safe_add( cls.bit_rol(cls.safe_add(cls.safe_add(param2, param1), cls.safe_add(param4, param6)), param5), param3) @classmethod def md5_ff(cls, param1, param2, param3, param4, param5, param6, param7): return cls.md5_cmn(param2 & param3 | ~param2 & param4, param1, param2, param5, param6, param7) @classmethod def md5_gg(cls, param1, param2, param3, param4, param5, param6, param7): return cls.md5_cmn(param2 & param4 | param3 & ~param4, param1, param2, param5, param6, param7) @classmethod def md5_hh(cls, param1, param2, param3, param4, param5, param6, param7): return cls.md5_cmn(param2 ^ param3 ^ param4, param1, param2, param5, param6, param7) @classmethod def md5_ii(cls, param1, param2, param3, param4, param5, param6, param7): return cls.md5_cmn(param3 ^ (param2 | ~param4), param1, param2, param5, param6, param7) @classmethod 
def safe_add(cls, param1, param2): _loc_3 = (param1 & 65535) + (param2 & 65535) _loc_4 = (param1 >> 16) + (param2 >> 16) + (_loc_3 >> 16) return cls.lshift(_loc_4, 16) | _loc_3 & 65535 @classmethod def bit_rol(cls, param1, param2): return cls.lshift(param1, param2) | (param1 & 0xFFFFFFFF) >> (32 - param2) @staticmethod def lshift(value, count): r = (0xFFFFFFFF & value) << count return -(~(r - 1) & 0xFFFFFFFF) if r > 0x7FFFFFFF else r def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] title = video['title'] formats = [] for resource in video['resources']: resource_id = resource.get('_id') if not resource_id or resource_id.endswith('manifest'): continue security = self._download_json( self._SECURITY_URL_TEMPLATE % (video_id, resource_id), video_id, 'Downloading security hash for %s' % resource_id) security_hash = security.get('hash') if not security_hash: message = security.get('message') if message: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, message), expected=True) continue hash_code = security_hash[:2] received_time = int(security_hash[2:12]) received_random = security_hash[12:22] received_md5 = security_hash[22:] sign_time = received_time + self._RESIGN_EXPIRATION padding = '%010d' % random.randint(1, 10000000000) signed_md5 = self.MD5.b64_md5(received_md5 + compat_str(sign_time) + padding) signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 resource_url = resource['url'] signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) else: formats.append({ 'url': signed_url, 'format_id': 'http-%s' % resource_id, 'height': int_or_none(resource.get('height')), }) self._sort_formats(formats) duration = float_or_none(video.get('duration'), 1000) uploader = video.get('channel') uploader_id = str_or_none(video.get('channel_id')) return { 'id': video_id, 'title': title, 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, 'formats': formats } class GloboArticleIE(InfoExtractor): _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)\.html' _VIDEOID_REGEXES = [ r'\bdata-video-id=["\'](\d{7,})', r'\bdata-player-videosids=["\'](\d{7,})', r'\bvideosIDs\s*:\s*["\'](\d{7,})', r'\bdata-id=["\'](\d{7,})', r'<div[^>]+\bid=["\'](\d{7,})', ] _TESTS = [{ 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', 'info_dict': { 'id': '3652183', 'ext': 'mp4', 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', 'duration': 110.711, 'uploader': 'Rede Globo', 'uploader_id': '196', } }, { 'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html', 'only_matching': True, }, { 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html', 'only_matching': True, }] @classmethod def suitable(cls, url): return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url) def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id 
= self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') return self.url_result('globo:%s' % video_id, 'Globo')
youtube-dl/youtube_dl/extractor/puls4.py
# -*- coding: utf-8 -*- from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, unified_strdate, int_or_none, ) class Puls4IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?puls4\.com/video/[^/]+/play/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.puls4.com/video/pro-und-contra/play/2716816', 'md5': '49f6a6629747eeec43cef6a46b5df81d', 'info_dict': { 'id': '2716816', 'ext': 'mp4', 'title': 'Pro und Contra vom 23.02.2015', 'description': 'md5:293e44634d9477a67122489994675db6', 'duration': 2989, 'upload_date': '20150224', 'uploader': 'PULS_4', }, 'skip': 'Only works from Germany', }, { 'url': 'http://www.puls4.com/video/kult-spielfilme/play/1298106', 'md5': '6a48316c8903ece8dab9b9a7bf7a59ec', 'info_dict': { 'id': '1298106', 'ext': 'mp4', 'title': 'Lucky Fritz', }, 'skip': 'Only works from Germany', }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) error_message = self._html_search_regex( r'<div class="message-error">(.+?)</div>', webpage, 'error message', default=None) if error_message: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) real_url = self._html_search_regex( r'\"fsk-button\".+?href=\"([^"]+)', webpage, 'fsk_button', default=None) if real_url: webpage = self._download_webpage(real_url, video_id) player = self._search_regex( r'p4_video_player(?:_iframe)?\("video_\d+_container"\s*,(.+?)\);\s*\}', webpage, 'player') player_json = self._parse_json( '[%s]' % player, video_id, transform_source=lambda s: s.replace('undefined,', '')) formats = None result = None for v in player_json: if isinstance(v, list) and not formats: formats = [{ 'url': f['url'], 'format': 'hd' if f.get('hd') else 'sd', 'width': int_or_none(f.get('size_x')), 'height': int_or_none(f.get('size_y')), 'tbr': int_or_none(f.get('bitrate')), } for f in v] self._sort_formats(formats) elif isinstance(v, dict) and not result: result = { 'id': video_id, 'title': v['videopartname'].strip(), 'description': v.get('videotitle'), 'duration': int_or_none(v.get('videoduration') or v.get('episodeduration')), 'upload_date': unified_strdate(v.get('clipreleasetime')), 'uploader': v.get('channel'), } result['formats'] = formats return result
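A note on the JSON handling above: Puls4IE cannot feed the raw p4_video_player(...) argument list to a JSON parser directly, so it strips the 'undefined,' tokens (valid JavaScript, invalid JSON) via transform_source and wraps the comma-separated arguments in brackets to form a single JSON array. A minimal standalone sketch of the same repair; the player_args sample string is invented for illustration and does not come from a real page:

import json

# Hypothetical argument list as it might appear inside a p4_video_player(...)
# call; 'undefined' is legal JavaScript but rejected by json.loads.
player_args = '[{"url": "http://example.com/v.mp4", "hd": true}], undefined, {"videopartname": "Demo "}'

# Same repair strategy as Puls4IE: drop 'undefined,' tokens, then wrap the
# comma-separated arguments in brackets so they parse as one JSON array.
cleaned = player_args.replace('undefined,', '')
args = json.loads('[%s]' % cleaned)

for arg in args:
    if isinstance(arg, list):
        print('format list:', arg)
    elif isinstance(arg, dict):
        print('metadata:', arg)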
youtube-dl/youtube_dl/extractor/rtlnl.py
# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, parse_duration, ) class RtlNlIE(InfoExtractor): IE_NAME = 'rtl.nl' IE_DESC = 'rtl.nl and rtlxl.nl' _VALID_URL = r'''(?x) https?://(?:www\.)? (?: rtlxl\.nl/\#!/[^/]+/| rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= ) (?P<id>[0-9a-f-]+)''' _TESTS = [{ 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', 'md5': 'cc16baa36a6c169391f0764fa6b16654', 'info_dict': { 'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', 'ext': 'mp4', 'title': 'RTL Nieuws - Laat', 'description': 'md5:6b61f66510c8889923b11f2778c72dc5', 'timestamp': 1408051800, 'upload_date': '20140814', 'duration': 576.880, }, }, { 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', 'md5': 'dea7474214af1271d91ef332fb8be7ea', 'info_dict': { 'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed', 'ext': 'mp4', 'timestamp': 1424039400, 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt.
Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } }, { # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275) 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', 'info_dict': { 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', 'ext': 'mp4', 'title': 'RTL Nieuws - Meer beelden van overval juwelier', 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', 'timestamp': 1437233400, 'upload_date': '20150718', 'duration': 30.474, }, 'params': { 'skip_download': True, }, }, { # encrypted m3u8 streams, georestricted 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', 'only_matching': True, }, { 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', 'only_matching': True, }] def _real_extract(self, url): uuid = self._match_id(url) info = self._download_json( 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid, uuid) material = info['material'][0] title = info['abstracts'][0]['name'] subtitle = material.get('title') if subtitle: title += ' - %s' % subtitle description = material.get('synopsis') meta = info.get('meta', {}) # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv. # To workaround this previously adaptive -> flash trick was used to obtain # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118) # and bypass georestrictions as well. # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore # unusable albeit can be fixed by simple string replacement (see # https://github.com/rg3/youtube-dl/pull/6337) # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted # streams are used now. 
videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' formats.extend([ { 'url': PG_URL_TEMPLATE % ('a2m', video_urlpart), 'format_id': 'pg-sd', }, { 'url': PG_URL_TEMPLATE % ('a3m', video_urlpart), 'format_id': 'pg-hd', 'quality': 0, } ]) self._sort_formats(formats) thumbnails = [] for p in ('poster_base_url', 'thumb_base_url'): if not meta.get(p): continue thumbnails.append({ 'url': self._proto_relative_url(meta[p] + uuid), 'width': int_or_none(self._search_regex( r'/sz=([0-9]+)', meta[p], 'thumbnail width', fatal=False)), 'height': int_or_none(self._search_regex( r'/sz=[0-9]+x([0-9]+)', meta[p], 'thumbnail height', fatal=False)) }) return { 'id': uuid, 'title': title, 'formats': formats, 'timestamp': material['original_date'], 'description': description, 'duration': parse_duration(material.get('duration')), 'thumbnails': thumbnails, }
youtube-dl/youtube_dl/extractor/rtbf.py
# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, unescapeHTML, ) class RTBFIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', 'md5': '799f334ddf2c0a582ba80c44655be570', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', 'duration': 3099, } }, { # geo restricted 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', 'only_matching': True, }, { 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', 'only_matching': True, }] _QUALITIES = [ ('mobile', 'mobile'), ('web', 'SD'), ('url', 'MD'), ('high', 'HD'), ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id) data = self._parse_json( unescapeHTML(self._search_regex( r'data-media="([^"]+)"', webpage, 'data video')), video_id) if (data.get('provider') or '').lower() == 'youtube': video_url = data.get('downloadUrl') or data.get('url') return self.url_result(video_url, 'Youtube') formats = [] for key, format_id in self._QUALITIES: format_url = data['sources'].get(key) if format_url: formats.append({ 'format_id': format_id, 'url':
format_url, }) return { 'id': video_id, 'formats': formats, 'title': data['title'], 'description': data.get('description') or data.get('subtitle'), 'thumbnail': data.get('thumbnail'), 'duration': data.get('duration') or data.get('realDuration'), 'timestamp': int_or_none(data.get('created')), 'view_count': int_or_none(data.get('viewCount')), }
youtube-dl/youtube_dl/extractor/discovery.py
from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, ) from ..compat import compat_str class DiscoveryIE(InfoExtractor): _VALID_URL = r'''(?x)http://(?:www\.)?(?: discovery| investigationdiscovery| discoverylife| animalplanet| ahctv| destinationamerica| sciencechannel| tlc| velocity )\.com/(?:[^/]+/)*(?P<id>[^./?#]+)''' _TESTS = [{ 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', 'info_dict': { 'id': '20769', 'ext': 'mp4', 'title': 'Mission Impossible Outtakes', 'description': ('Watch Jamie Hyneman and Adam Savage practice being' ' each other -- to the point of confusing Jamie\'s dog -- and ' 'don\'t miss Adam moon-walking as Jamie ...
behind Jamie\'s' ' back.'), 'duration': 156, 'timestamp': 1302032462, 'upload_date': '20110405', }, 'params': { 'skip_download': True, # requires ffmpeg } }, { 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons', 'info_dict': { 'id': 'mythbusters-the-simpsons', 'title': 'MythBusters: The Simpsons', }, 'playlist_mincount': 10, }, { 'url': 'http://www.animalplanet.com/longfin-eels-maneaters/', 'info_dict': { 'id': '78326', 'ext': 'mp4', 'title': 'Longfin Eels: Maneaters?', 'description': 'Jeremy Wade tests whether or not New Zealand\'s longfin eels are man-eaters by covering himself in fish guts and getting in the water with them.', 'upload_date': '20140725', 'timestamp': 1406246400, 'duration': 116, }, }] def _real_extract(self, url): display_id = self._match_id(url) info = self._download_json(url + '?flat=1', display_id) video_title = info.get('playlist_title') or info.get('video_title') entries = [{ 'id': compat_str(video_info['id']), 'formats': self._extract_m3u8_formats( video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls', note='Download m3u8 information for video %d' % (idx + 1)), 'title': video_info['title'], 'description': video_info.get('description'), 'duration': parse_duration(video_info.get('video_length')), 'webpage_url': video_info.get('href') or video_info.get('url'), 'thumbnail': video_info.get('thumbnailURL'), 'alt_title': video_info.get('secondary_title'), 'timestamp': parse_iso8601(video_info.get('publishedDate')), } for idx, video_info in enumerate(info['playlist'])] return self.playlist_result(entries, display_id, video_title)
youtube-dl/youtube_dl/extractor/cloudy.py
# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urllib_parse, compat_HTTPError, ) from ..utils import ( ExtractorError, HEADRequest, remove_end, ) class CloudyIE(InfoExtractor): _IE_DESC = 'cloudy.ec and videoraj.ch' _VALID_URL = r'''(?x) https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.ch)/ (?:v/|embed\.php\?id=) (?P<id>[A-Za-z0-9]+) ''' _EMBED_URL = 'http://www.%s/embed.php?id=%s' _API_URL = 'http://www.%s/api/player.api.php?%s' _MAX_TRIES = 2 _TESTS = [ { 'url': 'https://www.cloudy.ec/v/af511e2527aac', 'md5': '5cb253ace826a42f35b4740539bedf07', 'info_dict': { 'id': 'af511e2527aac', 'ext': 'flv', 'title': 'Funny Cats and Animals Compilation june 2013', } }, { 'url': 'http://www.videoraj.ch/v/47f399fd8bb60', 'md5': '7d0f8799d91efd4eda26587421c3c3b0', 'info_dict': { 'id': '47f399fd8bb60', 'ext': 'flv', 'title': 'Burning a New iPhone 5 with
Gasoline - Will it Survive?', } } ] def _extract_video(self, video_host, video_id, file_key, error_url=None, try_num=0): if try_num > self._MAX_TRIES - 1: raise ExtractorError('Unable to extract video URL', expected=True) form = { 'file': video_id, 'key': file_key, } if error_url: form.update({ 'numOfErrors': try_num, 'errorCode': '404', 'errorUrl': error_url, }) data_url = self._API_URL % (video_host, compat_urllib_parse.urlencode(form)) player_data = self._download_webpage( data_url, video_id, 'Downloading player data') data = compat_parse_qs(player_data) try_num += 1 if 'error' in data: raise ExtractorError( '%s error: %s' % (self.IE_NAME, ' '.join(data['error_msg'])), expected=True) title = data.get('title', [None])[0] if title: title = remove_end(title, '&asdasdas').strip() video_url = data.get('url', [None])[0] if video_url: try: self._request_webpage(HEADRequest(video_url), video_id, 'Checking video URL') except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]: self.report_warning('Invalid video URL, requesting another', video_id) return self._extract_video(video_host, video_id, file_key, video_url, try_num) return { 'id': video_id, 'url': video_url, 'title': title, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_host = mobj.group('host') video_id = mobj.group('id') url = self._EMBED_URL % (video_host, video_id) webpage = self._download_webpage(url, video_id) file_key = self._search_regex( [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], webpage, 'file_key') return self._extract_video(video_host, video_id, file_key)
youtube-dl/youtube_dl/extractor/theintercept.py
# encoding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( parse_iso8601, int_or_none, ExtractorError, ) class TheInterceptIE(InfoExtractor): _VALID_URL = r'https://theintercept.com/fieldofvision/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', 'md5': '145f28b41d44aab2f87c0a4ac8ec95bd', 'info_dict': { 'id': '46214', 'ext': 'mp4', 'title': '#ThisIsACoup – Episode Four: Surrender or Die', 'description': 'md5:74dd27f0e2fbd50817829f97eaa33140', 'timestamp': 1450429239, 'upload_date': '20151218', 'comment_count': int, } }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) json_data = self._parse_json(self._search_regex( r'initialStoreTree\s*=\s*(?P<json_data>{.+})', webpage, 'initialStoreTree'), display_id) for post in json_data['resources']['posts'].values(): if post['slug'] == display_id: return { '_type': 'url_transparent', 'url': 'jwplatform:%s' % post['fov_videoid'], 'id': compat_str(post['ID']), 'display_id': display_id, 'title': post['title'], 'description': post.get('excerpt'), 'timestamp':
parse_iso8601(post.get('date')), 'comment_count': int_or_none(post.get('comments_number')), } raise ExtractorError('Unable to find the current post')
youtube-dl/youtube_dl/extractor/gazeta.py
# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor class GazetaIE(InfoExtractor): _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' _TESTS = [{ 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', 'md5': 'd49c9bdc6e5a7888f27475dc215ee789', 'info_dict': { 'id': '205566', 'ext': 'mp4', 'title': '«70–80 процентов гражданских в Донецке на грани голода»', 'description': 'md5:38617526050bd17b234728e7f9620a71', 'thumbnail': 're:^https?://.*\.jpg', }, }, { 'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') embed_url = '%s?p=embed' % mobj.group('url') embed_page = self._download_webpage( embed_url, display_id, 'Downloading embed page') video_id = self._search_regex( r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id') return self.url_result( 'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform')
youtube-dl/youtube_dl/extractor/rottentomatoes.py
from __future__ import unicode_literals from .videodetective import VideoDetectiveIE # It just uses the same method as videodetective.com, # the internetvideoarchive.com is extracted from the og:video property class RottenTomatoesIE(VideoDetectiveIE): _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' _TEST = { 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', 'info_dict': { 'id': '613340', 'ext': 'mp4', 'title': 'TOY STORY 3', 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', }, }
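RottenTomatoesIE above is the lightest-weight extractor pattern in this tree: it subclasses VideoDetectiveIE and overrides only the URL pattern and test data, inheriting the extraction logic unchanged. A schematic sketch of that reuse pattern with hypothetical class names (BaseSiteIE and MirrorSiteIE are not real youtube-dl extractors):

import re

class BaseSiteIE(object):
    # Hypothetical base extractor: subclasses only swap out _VALID_URL.
    _VALID_URL = r'https?://base\.example\.com/video/(?P<id>\d+)'

    def extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ValueError('unsupported URL: %s' % url)
        # Shared extraction logic lives here; subclasses inherit it unchanged.
        return {'id': mobj.group('id')}

class MirrorSiteIE(BaseSiteIE):
    # Same backend, different URL scheme -- the same shape of reuse as
    # RottenTomatoesIE building on VideoDetectiveIE.
    _VALID_URL = r'https?://mirror\.example\.com/m/[^/]+/trailers/(?P<id>\d+)'

print(MirrorSiteIE().extract('http://mirror.example.com/m/toy_story_3/trailers/11028566'))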
youtube-dl/youtube_dl/extractor/youjizz.py
from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, ) class YouJizzIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/[^/#?]+-(?P<id>[0-9]+)\.html(?:$|[?#])' _TEST = { 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', 'md5': '07e15fa469ba384c7693fd246905547c', 'info_dict': { 'id': '2189178', 'ext': 'flv', 'title': 'Zeichentrick 1', 'age_limit': 18, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) age_limit = self._rta_search(webpage) video_title = self._html_search_regex( r'<title>\s*(.*)\s*</title>', webpage, 'title') embed_page_url = self._search_regex( r'(https?://www.youjizz.com/videos/embed/[0-9]+)', webpage, 'embed page') webpage = self._download_webpage( embed_page_url, video_id, note='downloading embed page') # Get the video URL m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage) if m_playlist is not None: playlist_url = m_playlist.group('playlist') playlist_page = self._download_webpage(playlist_url, video_id, 'Downloading playlist page') m_levels = list(re.finditer(r'<level bitrate="(\d+?)"(?:\s|.)*?<file>(.*?)</file>', playlist_page)) if len(m_levels) == 0: raise ExtractorError('Unable to extract video url') videos = [(int(m.group(1)), m.group(2)) for m in m_levels] video_url = max(videos)[1] else: video_url = self._search_regex( r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage, 'video URL') return { 'id': video_id, 'url': video_url, 'title': video_title, 'ext': 'flv', 'format': 'flv', 'player_url': embed_page_url, 'age_limit': age_limit, }
youtube-dl/youtube_dl/extractor/bloomberg.py
from __future__ import unicode_literals import re from .common import InfoExtractor class BloombergIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', # The md5 checksum changes 'info_dict': { 'id': 'qurhIVlJSB6hzkVi229d8g', 'ext': 'flv', 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', 'description': 'md5:a8ba0302912d03d246979735c17d2761', }, }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, }, { 'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump', 'only_matching': True, }] def _real_extract(self, url): name = self._match_id(url) webpage = self._download_webpage(url, name) video_id = self._search_regex( r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1', webpage, 'id', group='url') title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) formats = [] for stream in embed_info['streams']: stream_url = stream.get('url')
if not stream_url: continue if stream['muxing_format'] == 'TS': formats.extend(self._extract_m3u8_formats( stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.extend(self._extract_f4m_formats( stream_url, video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), }
youtube-dl/youtube_dl/extractor/slideshare.py
from __future__ import unicode_literals import re import json from .common import InfoExtractor from ..compat import ( compat_urlparse, ) from ..utils import ( ExtractorError, ) class SlideshareIE(InfoExtractor): _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' _TEST = { 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', 'info_dict': { 'id': '25665706', 'ext': 'mp4', 'title': 'Managing Scale and Complexity', 'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);', webpage, 'slideshare object') info = json.loads(slideshare_obj) if info['slideshow']['type'] != 'video': raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) doc = info['doc'] bucket = info['jsplayer']['video_bucket'] ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.'
+ ext) description = self._html_search_regex( r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage, 'description', fatal=False) return { '_type': 'video', 'id': info['slideshow']['id'], 'title': info['slideshow']['title'], 'ext': ext, 'url': video_url, 'thumbnail': info['slideshow']['pin_image_url'], 'description': description, }
youtube-dl/youtube_dl/extractor/collegerama.py
from __future__ import unicode_literals import json from .common import InfoExtractor from ..utils import ( float_or_none, int_or_none, sanitized_Request, ) class CollegeRamaIE(InfoExtractor): _VALID_URL = r'https?://collegerama\.tudelft\.nl/Mediasite/Play/(?P<id>[\da-f]+)' _TESTS = [ { 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', 'md5': '481fda1c11f67588c0d9d8fbdced4e39', 'info_dict': { 'id': '585a43626e544bdd97aeb71a0ec907a01d', 'ext': 'mp4', 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', 'description': '', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 7713.088, 'timestamp': 1413309600, 'upload_date': '20141014', }, }, { 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', 'md5': 'ef1fdded95bdf19b12c5999949419c92', 'info_dict': { 'id': '86a9ea9f53e149079fbdb4202b521ed21d', 'ext': 'wmv', 'title': '64ste Vakantiecursus: Afvalwater', 'description': 'md5:7fd774865cc69d972f542b157c328305', 'duration': 10853, 'timestamp': 1326446400, 'upload_date': '20120113', }, }, ] def _real_extract(self, url): video_id = self._match_id(url) player_options_request = { 'getPlayerOptionsRequest': { 'ResourceId': video_id, 'QueryString': '', } } request = sanitized_Request( 'http://collegerama.tudelft.nl/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', json.dumps(player_options_request)) request.add_header('Content-Type', 'application/json') player_options = self._download_json(request, video_id) presentation = player_options['d']['Presentation'] title = presentation['Title'] description = presentation.get('Description') thumbnail = None duration = float_or_none(presentation.get('Duration'), 1000) timestamp = int_or_none(presentation.get('UnixTime'), 1000) formats = [] for stream in presentation['Streams']: for video in stream['VideoUrls']: thumbnail_url = stream.get('ThumbnailUrl') if thumbnail_url: thumbnail = 'http://collegerama.tudelft.nl' + thumbnail_url format_id = video['MediaType'] if format_id == 'SS': continue formats.append({ 'url': video['Location'], 'format_id': format_id, }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'timestamp': timestamp, 'formats': formats, }
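CollegeRamaIE above is one of the few extractors that POSTs a JSON body rather than fetching a plain URL: it serializes player_options_request with json.dumps, attaches a Content-Type: application/json header, and decodes the JSON reply. A minimal sketch of the same request shape using only the standard library, against a hypothetical endpoint standing in for the Mediasite PlayerService URL:

import json
try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    from urllib2 import Request, urlopen  # Python 2

payload = {
    'getPlayerOptionsRequest': {
        'ResourceId': '585a43626e544bdd97aeb71a0ec907a01d',
        'QueryString': '',
    }
}

# POST the JSON body with an explicit Content-Type header; the host name
# here is invented for illustration, not the real Mediasite service.
request = Request(
    'http://player.example.com/PlayerService.svc/json/GetPlayerOptions',
    json.dumps(payload).encode('utf-8'),
    {'Content-Type': 'application/json'})

response = json.loads(urlopen(request).read().decode('utf-8'))
print(response['d']['Presentation']['Title'])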
youtube-dl/youtube_dl/extractor/blinkx.py
from __future__ import unicode_literals import json from .common import InfoExtractor from ..utils import ( remove_start, int_or_none, ) class BlinkxIE(InfoExtractor): _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)' IE_NAME = 'blinkx' _TEST = { 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ', 'md5': '337cf7a344663ec79bf93a526a2e06c7', 'info_dict': { 'id': 'Da0Gw3xc', 'ext': 'mp4', 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News', 'uploader': 'IGN News', 'upload_date': '20150217', 'timestamp': 1424215740, 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.', 'duration': 47.743333, }, } def _real_extract(self, url): video_id = self._match_id(url) display_id = video_id[:8] api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + 'video=%s' % video_id) data_json = self._download_webpage(api_url, display_id) data = json.loads(data_json)['api']['results'][0] duration = None thumbnails = [] formats = [] for m in data['media']: if m['type'] == 'jpg': thumbnails.append({ 'url': m['link'], 'width': int(m['w']), 'height': int(m['h']), }) elif m['type'] == 'original': duration = float(m['d']) elif m['type'] == 'youtube': yt_id = m['link'] self.to_screen('Youtube video detected: %s' % yt_id) return self.url_result(yt_id, 'Youtube', video_id=yt_id) elif m['type'] in ('flv', 'mp4'): vcodec = remove_start(m['vcodec'], 'ff') acodec = remove_start(m['acodec'], 'ff') vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000) abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000) tbr = vbr + abr if vbr and abr else None format_id = '%s-%sk-%s' % (vcodec, tbr, m['w']) formats.append({ 'format_id': format_id, 'url': m['link'], 'vcodec': vcodec, 'acodec': acodec, 'abr': abr, 'vbr': vbr, 'tbr': tbr, 'width': int_or_none(m.get('w')), 'height': int_or_none(m.get('h')), }) self._sort_formats(formats) return { 'id': display_id, 'fullid': video_id, 'title': data['title'], 'formats': formats, 'uploader': data['channel_name'], 'timestamp': data['pubdate_epoch'], 'description': data.get('description'), 'thumbnails': thumbnails, 'duration': duration, }
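One detail worth noting in BlinkxIE's format loop above: the total bitrate is computed only when both the video and audio bitrates are known, since summing with a missing component would understate the real figure, and the (possibly None) result is folded straight into the format_id. The arithmetic in isolation, with made-up media entries mimicking the records blinkx's API returns:

def describe_format(m):
    # Same combination rule as BlinkxIE: tbr is only meaningful when both
    # component bitrates are present (a 0/None part would skew the sum).
    vbr, abr = m.get('vbr'), m.get('abr')
    tbr = vbr + abr if vbr and abr else None
    return '%s-%sk-%s' % (m['vcodec'], tbr, m['w'])

# Invented sample entries for illustration only.
print(describe_format({'vcodec': 'h264', 'vbr': 740, 'abr': 96, 'w': 1280}))    # h264-836k-1280
print(describe_format({'vcodec': 'h264', 'vbr': 740, 'abr': None, 'w': 1280}))  # h264-Nonek-1280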
youtube-dl/youtube_dl/extractor/smotri.py
# encoding: utf-8 from __future__ import unicode_literals import re import json import hashlib import uuid from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, unified_strdate, ) class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 { 'url': 'http://smotri.com/video/view/?id=v261036632ab', 'md5': '2a7b08249e6f5636557579c368040eb9', 'info_dict': { 'id': 'v261036632ab', 'ext': 'mp4', 'title': 'катастрофа с камер видеонаблюдения', 'uploader': 'rbc2008', 'uploader_id': 'rbc08', 'upload_date': '20131118', 'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', }, }, # real video id 57591 { 'url': 'http://smotri.com/video/view/?id=v57591cb20', 'md5': '830266dfc21f077eac5afd1883091bcd', 'info_dict': { 'id': 'v57591cb20', 'ext': 'flv', 'title': 'test', 'uploader': 'Support Photofile@photofile', 'uploader_id': 'support-photofile', 'upload_date': '20070704', 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', }, }, # video-password, not approved by moderator { 'url': 'http://smotri.com/video/view/?id=v1390466a13c', 'md5': 'f6331cef33cad65a0815ee482a54440b', 'info_dict': { 'id': 'v1390466a13c', 'ext': 'mp4', 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', 'uploader': 'timoxa40', 'uploader_id': 'timoxa40', 'upload_date': '20100404', 'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', }, 'params': { 'videopassword': 'qwerty', }, 'skip': 'Video is not approved by moderator', }, # video-password { 'url': 'http://smotri.com/video/view/?id=v6984858774#', 'md5': 'f11e01d13ac676370fc3b95b9bda11b0', 'info_dict': { 'id': 'v6984858774', 'ext': 'mp4', 'title': 'Дача Солженицина ПАРОЛЬ 223322', 'uploader': 'psavari1', 'uploader_id': 'psavari1', 'upload_date': '20081103', 'thumbnail': 're:^https?://.*\.jpg$', }, 'params': { 'videopassword': '223322', }, }, # age limit + video-password, not approved by moderator { 'url': 'http://smotri.com/video/view/?id=v15408898bcf', 'md5': '91e909c9f0521adf5ee86fbe073aad70', 'info_dict': { 'id': 'v15408898bcf', 'ext': 'flv', 'title': 'этот ролик не покажут по ТВ', 'uploader': 'zzxxx', 'uploader_id': 'ueggb', 'upload_date': '20101001', 'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', 'age_limit': 18, }, 'params': { 'videopassword': '333' }, 'skip': 'Video is not approved by moderator', }, # age limit + video-password { 'url':
'http://smotri.com/video/view/?id=v7780025814', 'md5': 'b4599b068422559374a59300c5337d72', 'info_dict': { 'id': 'v7780025814', 'ext': 'mp4', 'title': 'Sexy Beach (пароль 123)', 'uploader': 'вАся', 'uploader_id': 'asya_prosto', 'upload_date': '20081218', 'thumbnail': 're:^https?://.*\.jpg$', 'age_limit': 18, }, 'params': { 'videopassword': '123' }, }, # swf player { 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500', 'md5': '31099eeb4bc906712c5f40092045108d', 'info_dict': { 'id': 'v9188090500', 'ext': 'mp4', 'title': 'Shakira - Don\'t Bother', 'uploader': 'HannahL', 'uploader_id': 'lisaha95', 'upload_date': '20090331', 'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg', }, }, ] @classmethod def _extract_url(cls, webpage): mobj = re.search( r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)', webpage) if mobj is not None: return mobj.group('url') mobj = re.search( r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s* <div\s+class="video_image">[^<]+</div>\s* <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage) if mobj is not None: return 'http://smotri.com/video/view/?id=%s' % mobj.group('id') def _search_meta(self, name, html, display_name=None): if display_name is None: display_name = name return self._html_search_meta(name, html, display_name) def _real_extract(self, url): video_id = self._match_id(url) video_form = { 'ticket': video_id, 'video_url': '1', 'frame_url': '1', 'devid': 'LoadupFlashPlayer', 'getvideoinfo': '1', } video_password = self._downloader.params.get('videopassword') if video_password: video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() request = sanitized_Request( 'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') video = self._download_json(request, video_id, 'Downloading video JSON') video_url = video.get('_vidURL') or video.get('_vidURL_mp4') if not video_url: if video.get('_moderate_no'): raise ExtractorError( 'Video %s has not been approved by moderator' % video_id, expected=True) if video.get('error'): raise ExtractorError('Video %s does not exist' % video_id, expected=True) if video.get('_pass_protected') == 1: msg = ('Invalid video password' if video_password else 'This video is protected by a password, use the --video-password option') raise ExtractorError(msg, expected=True) title = video['title'] thumbnail = video['_imgURL'] upload_date = unified_strdate(video['added']) uploader = video['userNick'] uploader_id = video['userLogin'] duration = int_or_none(video['duration']) # Video JSON does not provide enough meta data # We will extract some from the video web page instead webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page') # Warning if video is unavailable warning = self._html_search_regex( r'<div class="videoUnModer">(.*?)</div>', webpage, 'warning message', default=None) if warning is not None: self._downloader.report_warning( 'Video %s may not be available; smotri said: %s ' % (video_id, warning)) # Adult content if re.search('EroConfirmText">', webpage) is not None: self.report_age_confirmation() confirm_string = self._html_search_regex( r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, webpage, 'confirm string') confirm_url = webpage_url + '&confirm=%s' % confirm_string webpage = 
self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)') adult_content = True else: adult_content = False view_count = self._html_search_regex( 'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', webpage, 'view count', fatal=False, flags=re.MULTILINE | re.DOTALL) return { 'id': video_id, 'url': video_url, 'title': title, 'thumbnail': thumbnail, 'uploader': uploader, 'upload_date': upload_date, 'uploader_id': uploader_id, 'duration': duration, 'view_count': int_or_none(view_count), 'age_limit': 18 if adult_content else 0, } class SmotriCommunityIE(InfoExtractor): IE_DESC = 'Smotri.com community videos' IE_NAME = 'smotri:community' _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' _TEST = { 'url': 'http://smotri.com/community/video/kommuna', 'info_dict': { 'id': 'kommuna', 'title': 'КПРФ', }, 'playlist_mincount': 4, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) community_id = mobj.group('communityid') url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id rss = self._download_xml(url, community_id, 'Downloading community RSS') entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] description_text = rss.find('./channel/description').text community_title = self._html_search_regex( '^Видео сообщества "([^"]+)"$', description_text, 'community title') return self.playlist_result(entries, community_id, community_title) class SmotriUserIE(InfoExtractor): IE_DESC = 'Smotri.com user videos' IE_NAME = 'smotri:user' _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' _TESTS = [{ 'url': 'http://smotri.com/user/inspector', 'info_dict': { 'id': 'inspector', 'title': 'Inspector', }, 'playlist_mincount': 9, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('userid') url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id rss = self._download_xml(url, user_id, 'Downloading user RSS') entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] description_text = rss.find('./channel/description').text user_nickname = self._html_search_regex( '^Видео режиссера (.*)$', description_text, 'user nickname') return self.playlist_result(entries, user_id, user_nickname) class SmotriBroadcastIE(InfoExtractor): IE_DESC = 'Smotri.com broadcasts' IE_NAME = 'smotri:broadcast' _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) broadcast_id = mobj.group('broadcastid') broadcast_url = 'http://' + mobj.group('url') broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: raise ExtractorError( 'Broadcast %s does not exist' % broadcast_id, expected=True) # Adult content if re.search('EroConfirmText">', broadcast_page) is not None: (username, password) = self._get_login_info() if username is None: self.raise_login_required('Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', 'confirm_erotic': '1', 'login': username, 'password': password, } request = sanitized_Request( broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form)) request.add_header('Content-Type', 
'application/x-www-form-urlencoded') broadcast_page = self._download_webpage( request, broadcast_id, 'Logging in and confirming age') if re.search('>Неверный логин или пароль<', broadcast_page) is not None: raise ExtractorError('Unable to log in: bad username or password', expected=True) adult_content = True else: adult_content = False ticket = self._html_search_regex( r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'([^']+)'\)", broadcast_page, 'broadcast ticket') url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket broadcast_password = self._downloader.params.get('videopassword') if broadcast_password: url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() broadcast_json_page = self._download_webpage( url, broadcast_id, 'Downloading broadcast JSON') try: broadcast_json = json.loads(broadcast_json_page) protected_broadcast = broadcast_json['_pass_protected'] == 1 if protected_broadcast and not broadcast_password: raise ExtractorError( 'This broadcast is protected by a password, use the --video-password option', expected=True) broadcast_offline = broadcast_json['is_play'] == 0 if broadcast_offline: raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True) rtmp_url = broadcast_json['_server'] mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url) if not mobj: raise ExtractorError('Unexpected broadcast rtmp URL') broadcast_playpath = broadcast_json['_streamName'] broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) broadcast_thumbnail = broadcast_json['_imgURL'] broadcast_title = self._live_title(broadcast_json['title']) broadcast_description = broadcast_json['description'] broadcaster_nick = broadcast_json['nick'] broadcaster_login = broadcast_json['login'] rtmp_conn = 'S:%s' % uuid.uuid4().hex except KeyError: if protected_broadcast: raise ExtractorError('Bad broadcast password', expected=True) raise ExtractorError('Unexpected broadcast JSON') return { 'id': broadcast_id, 'url': rtmp_url, 'title': broadcast_title, 'thumbnail': broadcast_thumbnail, 'description': broadcast_description, 'uploader': broadcaster_nick, 'uploader_id': broadcaster_login, 'age_limit': 18 if adult_content else 0, 'ext': 'flv', 'play_path': broadcast_playpath, 'player_url': 'http://pics.smotri.com/broadcast_play.swf', 'app': broadcast_app, 'rtmp_live': True, 'rtmp_conn': rtmp_conn, 'is_live': True, } ���������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/twentymin.py��������������������������������������������������������0000644�0000000�0000000�00000005450�12644050477�020460� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import remove_end class TwentyMinutenIE(InfoExtractor): IE_NAME = '20min' _VALID_URL = r'https?://(?:www\.)?20min\.ch/(?:videotv/*\?.*\bvid=(?P<id>\d+)|(?:[^/]+/)*(?P<display_id>[^/#?]+))' _TESTS = [{ # regular video 'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2', 'md5': 'b52d6bc6ea6398e6a38f12cfd418149c', 'info_dict': { 'id': '469148', 'ext': 'flv', 
'title': '85 000 Franken für 15 perfekte Minuten', 'description': 'Was die Besucher vom Silvesterzauber erwarten können. (Video: Alice Grosjean/Murat Temel)', 'thumbnail': 'http://thumbnails.20min-tv.ch/server063/469148/frame-72-469148.jpg' } }, { # news article with video 'url': 'http://www.20min.ch/schweiz/news/story/-Wir-muessen-mutig-nach-vorne-schauen--22050469', 'md5': 'cd4cbb99b94130cff423e967cd275e5e', 'info_dict': { 'id': '469408', 'display_id': '-Wir-muessen-mutig-nach-vorne-schauen--22050469', 'ext': 'flv', 'title': '«Wir müssen mutig nach vorne schauen»', 'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.', 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg' } }, { 'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738', 'only_matching': True, }, { 'url': 'http://www.20min.ch/ro/sortir/cinema/story/Grandir-au-bahut--c-est-dur-18927411', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id webpage = self._download_webpage(url, display_id) title = self._html_search_regex( r'<h1>.*?<span>(.+?)</span></h1>', webpage, 'title', default=None) if not title: title = remove_end(re.sub( r'^20 [Mm]inuten.*? -', '', self._og_search_title(webpage)), ' - News') if not video_id: video_id = self._search_regex( r'"file\d?"\s*,\s*\"(\d+)', webpage, 'video id') description = self._html_search_meta( 'description', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) return { 'id': video_id, 'display_id': display_id, 'url': 'http://speed.20min-tv.ch/%sm.flv' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/mpora.py������������������������������������������������������������0000644�0000000�0000000�00000004005�12641030331�017513� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals from .common import InfoExtractor from ..utils import int_or_none class MporaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)' IE_NAME = 'MPORA' _TEST = { 'url': 'http://mpora.de/videos/AAdo8okx4wiz/embed?locale=de', 'md5': 'a7a228473eedd3be741397cf452932eb', 'info_dict': { 'id': 'AAdo8okx4wiz', 'ext': 'mp4', 'title': 'Katy Curd - Winter in the Forest', 'duration': 416, 'uploader': 'Peter Newman Media', }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) data_json = self._search_regex( [r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", r"new\s+FM\.Kaltura\.Player\('[^']+'\s*,\s*({.+?})\);"], webpage, 'json') data = self._parse_json(data_json, video_id) uploader = data['info_overlay'].get('username') duration = data['video']['duration'] // 1000 thumbnail = data['video']['encodings']['sd']['poster'] title = 
data['info_overlay']['title'] formats = [] for encoding_id, edata in data['video']['encodings'].items(): for src in edata['sources']: width_str = self._search_regex( r'_([0-9]+)\.[a-zA-Z0-9]+$', src['src'], False, default=None) vcodec = src['type'].partition('/')[2] formats.append({ 'format_id': encoding_id + '-' + vcodec, 'url': src['src'], 'vcodec': vcodec, 'width': int_or_none(width_str), }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, 'uploader': uploader, 'duration': duration, 'thumbnail': thumbnail, } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/streetvoice.py������������������������������������������������������0000644�0000000�0000000�00000003215�12641030331�020733� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import unified_strdate class StreetVoiceIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://streetvoice.com/skippylu/songs/94440/', 'md5': '15974627fc01a29e492c98593c2fd472', 'info_dict': { 'id': '94440', 'ext': 'mp3', 'filesize': 4167053, 'title': '輸', 'description': 'Crispy脆樂團 - 輸', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 260, 'upload_date': '20091018', 'uploader': 'Crispy脆樂團', 'uploader_id': '627810', } }, { 'url': 'http://tw.streetvoice.com/skippylu/songs/94440/', 'only_matching': True, }] def _real_extract(self, url): song_id = self._match_id(url) song = self._download_json( 'http://streetvoice.com/music/api/song/%s' % song_id, song_id) title = song['name'] author = song['musician']['name'] return { 'id': song_id, 'url': song['file'], 'filesize': song.get('size'), 'title': title, 'description': '%s - %s' % (author, title), 'thumbnail': self._proto_relative_url(song.get('image'), 'http:'), 'duration': song.get('length'), 'upload_date': unified_strdate(song.get('created_at')), 'uploader': author, 'uploader_id': compat_str(song['musician']['id']), } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/pornhd.py�����������������������������������������������������������0000644�0000000�0000000�00000004703�12641030331�017674� 0����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import re import json from .common import InfoExtractor from ..utils import ( int_or_none, js_to_json, qualities, ) class PornHdIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' _TEST = { 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'md5': '956b8ca569f7f4d8ec563e2c41598441', 'info_dict': { 'id': '1962', 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'ext': 'mp4', 'title': 'Sierra loves doing laundry', 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', 'thumbnail': 're:^https?://.*\.jpg', 'view_count': int, 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id or video_id) title = self._html_search_regex( [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)', r'<title>(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') description = self._html_search_regex( r'
<div class="description">([^<]+)</div>
    ', webpage, 'description', fatal=False) view_count = int_or_none(self._html_search_regex( r'(\d+) views\s*', webpage, 'view count', fatal=False)) thumbnail = self._search_regex( r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) quality = qualities(['sd', 'hd']) sources = json.loads(js_to_json(self._search_regex( r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]", webpage, 'sources'))) formats = [] for qname, video_url in sources.items(): if not video_url: continue formats.append({ 'url': video_url, 'format_id': qname, 'quality': quality(qname), }) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'view_count': view_count, 'formats': formats, 'age_limit': 18, } youtube-dl/youtube_dl/extractor/nerdcubed.py0000644000000000000000000000215312660177411020346 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import datetime from .common import InfoExtractor class NerdCubedFeedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/feed\.json' _TEST = { 'url': 'http://www.nerdcubed.co.uk/feed.json', 'info_dict': { 'id': 'nerdcubed-feed', 'title': 'nerdcubed.co.uk feed', }, 'playlist_mincount': 1300, } def _real_extract(self, url): feed = self._download_json(url, url, 'Downloading NerdCubed JSON feed') entries = [{ '_type': 'url', 'title': feed_entry['title'], 'uploader': feed_entry['source']['name'] if feed_entry['source'] else None, 'upload_date': datetime.datetime.strptime(feed_entry['date'], '%Y-%m-%d').strftime('%Y%m%d'), 'url': 'http://www.youtube.com/watch?v=' + feed_entry['youtube_id'], } for feed_entry in feed] return { '_type': 'playlist', 'title': 'nerdcubed.co.uk feed', 'id': 'nerdcubed-feed', 'entries': entries, } youtube-dl/youtube_dl/extractor/veehd.py0000644000000000000000000001004112641030331017465 0ustar rootrootfrom __future__ import unicode_literals import re import json from .common import InfoExtractor from ..compat import ( compat_urllib_parse_unquote, compat_urlparse, ) from ..utils import ( ExtractorError, clean_html, get_element_by_id, ) class VeeHDIE(InfoExtractor): _VALID_URL = r'https?://veehd\.com/video/(?P\d+)' # Seems VeeHD videos have multiple copies on several servers, all of # whom have different MD5 checksums, so omit md5 field in all tests _TESTS = [{ 'url': 'http://veehd.com/video/4639434_Solar-Sinter', 'info_dict': { 'id': '4639434', 'ext': 'mp4', 'title': 'Solar Sinter', 'uploader_id': 'VideoEyes', 'description': 'md5:46a840e8692ddbaffb5f81d9885cb457', }, 'skip': 'Video deleted', }, { 'url': 'http://veehd.com/video/4905758_Elysian-Fields-Channeling', 'info_dict': { 'id': '4905758', 'ext': 'mp4', 'title': 'Elysian Fields - Channeling', 'description': 'md5:360e4e95fdab58aefbea0f2a19e5604b', 'uploader_id': 'spotted', } }, { 'url': 'http://veehd.com/video/2046729_2012-2009-DivX-Trailer', 'info_dict': { 'id': '2046729', 'ext': 'avi', 'title': '2012 (2009) DivX Trailer', 'description': 'md5:75435ee95255e6a9838ac6f6f3a2396b', 'uploader_id': 'Movie_Trailers', } }] def _real_extract(self, url): video_id = self._match_id(url) # VeeHD seems to send garbage on the first request. 
# See https://github.com/rg3/youtube-dl/issues/2102 self._download_webpage(url, video_id, 'Requesting webpage') webpage = self._download_webpage(url, video_id) if 'This video has been removed<' in webpage: raise ExtractorError('Video %s has been removed' % video_id, expected=True) player_path = self._search_regex( r'\$\("#playeriframe"\).attr\({src : "(.+?)"', webpage, 'player path') player_url = compat_urlparse.urljoin(url, player_path) self._download_webpage(player_url, video_id, 'Requesting player page') player_page = self._download_webpage( player_url, video_id, 'Downloading player page') video_url = None config_json = self._search_regex( r'value=\'config=({.+?})\'', player_page, 'config json', default=None) if config_json: config = json.loads(config_json) video_url = compat_urllib_parse_unquote(config['clip']['url']) if not video_url: video_url = self._html_search_regex( r']+type="video/divx"[^>]+src="([^"]+)"', player_page, 'video url', default=None) if not video_url: iframe_src = self._search_regex( r']+src="/?([^"]+)"', player_page, 'iframe url') iframe_url = 'http://veehd.com/%s' % iframe_src self._download_webpage(iframe_url, video_id, 'Requesting iframe page') iframe_page = self._download_webpage( iframe_url, video_id, 'Downloading iframe page') video_url = self._search_regex( r"file\s*:\s*'([^']+)'", iframe_page, 'video url') title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0]) uploader_id = self._html_search_regex( r'
    (.+?)', webpage, 'uploader') thumbnail = self._search_regex( r'(.*?)]+data-guid="([^"]+)"', webpage, 'guid') feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name) content = feed.find('.//{http://search.yahoo.com/mrss/}content') theplatform_id = url_basename(content.attrib.get('url')) return self.url_result(smuggle_url( 'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id, # For some reason, the normal links don't work and we must force # the use of f4m {'force_smil_url': True})) youtube-dl/youtube_dl/extractor/canalplus.py0000644000000000000000000001276012641247326020404 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, HEADRequest, unified_strdate, url_basename, qualities, int_or_none, ) class CanalplusIE(InfoExtractor): IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' _VALID_URL = r'https?://(?:www\.(?Pcanalplus\.fr|piwiplus\.fr|d8\.tv|itele\.fr)/.*?/(?P.*)|player\.canalplus\.fr/#/(?P[0-9]+))' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' _SITE_ID_MAP = { 'canalplus.fr': 'cplus', 'piwiplus.fr': 'teletoon', 'd8.tv': 'd8', 'itele.fr': 'itele', } _TESTS = [{ 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092', 'md5': '12164a6f14ff6df8bd628e8ba9b10b78', 'info_dict': { 'id': '1263092', 'ext': 'mp4', 'title': 'Le Zapping - 13/05/15', 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f', 'upload_date': '20150513', }, }, { 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', 'info_dict': { 'id': '1108190', 'ext': 'flv', 'title': 'Le labyrinthe - Boing super ranger', 'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff', 'upload_date': '20140724', }, 'skip': 'Only works from France', }, { 'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html', 'info_dict': { 'id': '966289', 'ext': 'flv', 'title': 'Campagne intime - Documentaire exceptionnel', 'description': 'md5:d2643b799fb190846ae09c61e59a859f', 'upload_date': '20131108', }, 'skip': 'videos get deleted after a while', }, { 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559', 'md5': '38b8f7934def74f0d6f3ba6c036a5f82', 'info_dict': { 'id': '1213714', 'ext': 'mp4', 'title': 'Aubervilliers : un lycée en colère - Le 11/02/2015 à 06h45', 'description': 'md5:8216206ec53426ea6321321f3b3c16db', 'upload_date': '20150211', }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.groupdict().get('id') site_id = self._SITE_ID_MAP[mobj.group('site') or 'canal'] # Beware, some subclasses do not define an id group display_id = url_basename(mobj.group('path')) if video_id is None: webpage = self._download_webpage(url, display_id) video_id = self._search_regex( [r']+?videoId=(["\'])(?P\d+)', r'id=["\']canal_video_player(?P\d+)'], webpage, 'video id', group='id') info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) video_data = self._download_json(info_url, video_id, 'Downloading video JSON') if isinstance(video_data, list): video_data = [video for video in video_data if video.get('ID') == video_id][0] media = video_data['MEDIA'] infos = video_data['INFOS'] preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD']) fmt_url = next(iter(media.get('VIDEOS'))) if '/geo' in fmt_url.lower(): response = self._request_webpage( HEADRequest(fmt_url), video_id, 'Checking if the video is 
georestricted') if '/blocage' in response.geturl(): raise ExtractorError( 'The video is not available in your country', expected=True) formats = [] for format_id, format_url in media['VIDEOS'].items(): if not format_url: continue if format_id == 'HLS': formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) elif format_id == 'HDS': formats.extend(self._extract_f4m_formats( format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) else: formats.append({ # the secret extracted ya function in http://player.canalplus.fr/common/js/canalPlayer.js 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', 'format_id': format_id, 'preference': preference(format_id), }) self._sort_formats(formats) thumbnails = [{ 'id': image_id, 'url': image_url, } for image_id, image_url in media.get('images', {}).items()] titrage = infos['TITRAGE'] return { 'id': video_id, 'display_id': display_id, 'title': '%s - %s' % (titrage['TITRE'], titrage['SOUS_TITRE']), 'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')), 'thumbnails': thumbnails, 'description': infos.get('DESCRIPTION'), 'duration': int_or_none(infos.get('DURATION')), 'view_count': int_or_none(infos.get('NB_VUES')), 'like_count': int_or_none(infos.get('NB_LIKES')), 'comment_count': int_or_none(infos.get('NB_COMMENTS')), 'formats': formats, } youtube-dl/youtube_dl/extractor/imdb.py0000644000000000000000000000650312641030331017315 0ustar rootrootfrom __future__ import unicode_literals import re import json from .common import InfoExtractor from ..utils import ( qualities, ) class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P\d+)' _TEST = { 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', 'title': 'Ice Age: Continental Drift Trailer (No. 
2) - IMDb', 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) descr = self._html_search_regex( r'(?s)(.*?)', webpage, 'description', fatal=False) player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id player_page = self._download_webpage( player_url, video_id, 'Downloading player page') # the player page contains the info for the default format, we have to # fetch other pages for the rest of the formats extra_formats = re.findall(r'href="(?P%s.*?)".*?>(?P.*?)<' % re.escape(player_url), player_page) format_pages = [ self._download_webpage( f_url, video_id, 'Downloading info for %s format' % f_name) for f_url, f_name in extra_formats] format_pages.append(player_page) quality = qualities(['SD', '480p', '720p']) formats = [] for format_page in format_pages: json_data = self._search_regex( r']+class="imdb-player-data"[^>]*?>(.*?)', format_page, 'json data', flags=re.DOTALL) info = json.loads(json_data) format_info = info['videoPlayerObject']['video'] f_id = format_info['ffname'] formats.append({ 'format_id': f_id, 'url': format_info['videoInfoList'][0]['videoUrl'], 'quality': quality(f_id), }) self._sort_formats(formats) return { 'id': video_id, 'title': self._og_search_title(webpage), 'formats': formats, 'description': descr, 'thumbnail': format_info['slate'], } class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' _VALID_URL = r'http://www\.imdb\.com/list/(?P[\da-zA-Z_-]{11})' _TEST = { 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', 'info_dict': { 'id': 'JFs9NWw6XI0', 'title': 'March 23, 2012 Releases', }, 'playlist_count': 7, } def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) entries = [ self.url_result('http://www.imdb.com' + m, 'Imdb') for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)] list_title = self._html_search_regex( r'

<h1 class="header">(.*?)</h1>

    ', webpage, 'list title') return self.playlist_result(entries, list_id, list_title) youtube-dl/youtube_dl/extractor/myvidster.py0000644000000000000000000000162712641030331020432 0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor class MyVidsterIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?myvidster\.com/video/(?P\d+)/' _TEST = { 'url': 'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making', 'md5': '95296d0231c1363222c3441af62dc4ca', 'info_dict': { 'id': '3685814', 'title': 'md5:7d8427d6d02c4fbcef50fe269980c749', 'upload_date': '20141027', 'uploader_id': 'utkualp', 'ext': 'mp4', 'age_limit': 18, }, 'add_ie': ['XHamster'], } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) return self.url_result(self._html_search_regex( r'rel="videolink" href="(?P.*)">', webpage, 'real video url')) youtube-dl/youtube_dl/extractor/vesti.py0000644000000000000000000001043712641030331017535 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ExtractorError from .rutv import RUTVIE class VestiIE(InfoExtractor): IE_DESC = 'Вести.Ru' _VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P.+)' _TESTS = [ { 'url': 'http://www.vesti.ru/videos?vid=575582&cid=1', 'info_dict': { 'id': '765035', 'ext': 'mp4', 'title': 'Вести.net: биткоины в России не являются законными', 'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b', 'duration': 302, }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://www.vesti.ru/doc.html?id=1349233', 'info_dict': { 'id': '773865', 'ext': 'mp4', 'title': 'Участники митинга штурмуют Донецкую областную администрацию', 'description': 'md5:1a160e98b3195379b4c849f2f4958009', 'duration': 210, }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://www.vesti.ru/only_video.html?vid=576180', 'info_dict': { 'id': '766048', 'ext': 'mp4', 'title': 'США заморозило, Британию затопило', 'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1', 'duration': 87, }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://hitech.vesti.ru/news/view/id/4000', 'info_dict': { 'id': '766888', 'ext': 'mp4', 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"', 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995', 'duration': 279, }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403', 'info_dict': { 'id': '766403', 'ext': 'mp4', 'title': 'XXII зимние Олимпийские игры. Российские хоккеисты стартовали на Олимпиаде с победы', 'description': 'md5:55805dfd35763a890ff50fa9e35e31b3', 'duration': 271, }, 'params': { # m3u8 download 'skip_download': True, }, 'skip': 'Blocked outside Russia', }, { 'url': 'http://sochi2014.vesti.ru/live/play/live_id/301', 'info_dict': { 'id': '51499', 'ext': 'flv', 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. 
Мужчины ', 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c', }, 'params': { # rtmp download 'skip_download': True, }, 'skip': 'Translation has finished' }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') page = self._download_webpage(url, video_id, 'Downloading page') mobj = re.search( r']+?property="og:video"[^>]+?content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P\d+)', page) if mobj: video_id = mobj.group('id') page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id, 'Downloading video page') rutv_url = RUTVIE._extract_url(page) if rutv_url: return self.url_result(rutv_url, 'RUTV') raise ExtractorError('No video found', expected=True) youtube-dl/youtube_dl/extractor/scivee.py0000644000000000000000000000354612641030331017664 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import int_or_none class SciVeeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?scivee\.tv/node/(?P\d+)' _TEST = { 'url': 'http://www.scivee.tv/node/62352', 'md5': 'b16699b74c9e6a120f6772a44960304f', 'info_dict': { 'id': '62352', 'ext': 'mp4', 'title': 'Adam Arkin at the 2014 DOE JGI Genomics of Energy & Environment Meeting', 'description': 'md5:81f1710638e11a481358fab1b11059d7', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') # annotations XML is malformed annotations = self._download_webpage( 'http://www.scivee.tv/assets/annotations/%s' % video_id, video_id, 'Downloading annotations') title = self._html_search_regex(r'([^<]+)', annotations, 'title') description = self._html_search_regex(r'([^<]+)', annotations, 'abstract', fatal=False) filesize = int_or_none(self._html_search_regex( r'([^<]+)', annotations, 'filesize', fatal=False)) formats = [ { 'url': 'http://www.scivee.tv/assets/audio/%s' % video_id, 'ext': 'mp3', 'format_id': 'audio', }, { 'url': 'http://www.scivee.tv/assets/video/%s' % video_id, 'ext': 'mp4', 'format_id': 'video', 'filesize': filesize, }, ] return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': 'http://www.scivee.tv/assets/videothumb/%s' % video_id, 'formats': formats, } youtube-dl/youtube_dl/extractor/vuclip.py0000644000000000000000000000522412641030331017703 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( ExtractorError, parse_duration, qualities, ) class VuClipIE(InfoExtractor): _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P[0-9]+)' _TEST = { 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', 'info_dict': { 'id': '922692425', 'ext': '3gp', 'title': 'The Toy Soldiers - Hollywood Movie Trailer', 'duration': 180, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) ad_m = re.search( r'''value="No.*?" onClick="location.href='([^"']+)'"''', webpage) if ad_m: urlr = compat_urllib_parse_urlparse(url) adfree_url = urlr.scheme + '://' + urlr.netloc + ad_m.group(1) webpage = self._download_webpage( adfree_url, video_id, note='Download post-ad page') error_msg = self._html_search_regex( r'

<p class="message">(.*?)</p>

', webpage, 'error message', default=None) if error_msg: raise ExtractorError( '%s said: %s' % (self.IE_NAME, error_msg), expected=True) # These clowns alternate between two page types links_code = self._search_regex( r'''(?xs) (?: <img\s+src="[^"]*/play.gif".*?> | <!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->
) (.*?) (?: <a\s+href="fblike|<div\s+class="social"> ) ''', webpage, 'links') title = self._html_search_regex( r'<title>(.*?)-\s*Vuclip', webpage, 'title').strip() quality_order = qualities(['Reg', 'Hi']) formats = [] for url, q in re.findall( r'<a href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code): format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q formats.append({ 'format_id': format_id, 'url': url, 'quality': quality_order(q), }) self._sort_formats(formats) duration = parse_duration(self._search_regex( r'\(([0-9:]+)\)', webpage, 'duration', fatal=False)) return { 'id': video_id, 'formats': formats, 'title': title, 'duration': duration, } youtube-dl/youtube_dl/extractor/tnaflix.py0000644000000000000000000002405012641030331020044 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( fix_xml_ampersands, float_or_none, int_or_none, parse_duration, str_to_int, xpath_text, ) class TNAFlixNetworkBaseIE(InfoExtractor): # May be overridden in descendants if necessary _CONFIG_REGEX = [ r'flashvars\.config\s*=\s*escape\("([^"]+)"', r'<input[^>]+name="config\d?" value="([^"]+)"', ] _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' _VIEW_COUNT_REGEX = None _COMMENT_COUNT_REGEX = None _AVERAGE_RATING_REGEX = None _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>' def _extract_thumbnails(self, flix_xml): def get_child(elem, names): for name in names: child = elem.find(name) if child is not None: return child timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage']) if timeline is None: return pattern_el = get_child(timeline, ['imagePattern', 'pattern']) if pattern_el is None or not pattern_el.text: return first_el = get_child(timeline, ['imageFirst', 'first']) last_el = get_child(timeline, ['imageLast', 'last']) if first_el is None or last_el is None: return first_text = first_el.text last_text = last_el.text if not first_text.isdigit() or not last_text.isdigit(): return first = int(first_text) last = int(last_text) if first > last: return width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width')) height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height')) return [{ 'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'), 'width': width, 'height': height, } for i in range(first, last + 1)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) cfg_url = self._proto_relative_url(self._html_search_regex( self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:') cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', transform_source=fix_xml_ampersands) formats = [] def extract_video_url(vl): return re.sub('speed=\d+', 'speed=', vl.text) video_link = cfg_xml.find('./videoLink') if video_link is not None: formats.append({ 'url': extract_video_url(video_link), 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), }) for item in cfg_xml.findall('./quality/item'): video_link = item.find('./videoLink') if video_link is None: continue res = item.find('res') format_id = None if res is None else res.text height = int_or_none(self._search_regex( r'^(\d+)[pP]', format_id, 'height', default=None)) formats.append({ 'url':
self._proto_relative_url(extract_video_url(video_link), 'http:'), 'format_id': format_id, 'height': height, }) self._sort_formats(formats) thumbnail = self._proto_relative_url( xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') thumbnails = self._extract_thumbnails(cfg_xml) title = self._html_search_regex( self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) age_limit = self._rta_search(webpage) duration = parse_duration(self._html_search_meta( 'duration', webpage, 'duration', default=None)) def extract_field(pattern, name): return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None description = extract_field(self._DESCRIPTION_REGEX, 'description') uploader = extract_field(self._UPLOADER_REGEX, 'uploader') view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')) average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') categories = categories_str.split(', ') if categories_str is not None else [] return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'thumbnails': thumbnails, 'duration': duration, 'age_limit': age_limit, 'uploader': uploader, 'view_count': view_count, 'comment_count': comment_count, 'average_rating': average_rating, 'categories': categories, 'formats': formats, } class TNAFlixIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>' _DESCRIPTION_REGEX = r'
<h3 itemprop="description">([^<]+)</h3>
' _UPLOADER_REGEX = r'(?s)<span[^>]+class="infoTitle"[^>]*>Uploaded By:(.+?)<a' class EMPFlixIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html' _UPLOADER_REGEX = r'<span[^>]+class="infoTitle"[^>]*>Uploaded By:(.+?)</span>' _TESTS = [{ 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', 'md5': 'b1bc15b6412d33902d6e5952035fcabc', 'info_dict': { 'id': '33051', 'display_id': 'Amateur-Finger-Fuck', 'ext': 'mp4', 'title': 'Amateur Finger Fuck', 'description': 'Amateur solo finger fucking.', 'thumbnail': 're:https?://.*\.jpg$', 'duration': 83, 'age_limit': 18, 'uploader': 'cwbike', 'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'], } }, { 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', 'only_matching': True, }] class MovieFapIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html' _VIEW_COUNT_REGEX = r'
<br>Views\s*<strong>([\d,.]+)</strong>' _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>' _AVERAGE_RATING_REGEX = r'Current Rating\s*<strong>
\s*([\d.]+)</strong>' _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>
    ' _TESTS = [{ # normal, multi-format video 'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html', 'md5': '26624b4e2523051b550067d547615906', 'info_dict': { 'id': 'be9867c9416c19f54a4a', 'display_id': 'experienced-milf-amazing-handjob', 'ext': 'mp4', 'title': 'Experienced MILF Amazing Handjob', 'description': 'Experienced MILF giving an Amazing Handjob', 'thumbnail': 're:https?://.*\.jpg$', 'age_limit': 18, 'uploader': 'darvinfred06', 'view_count': int, 'comment_count': int, 'average_rating': float, 'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'], } }, { # quirky single-format case where the extension is given as fid, but the video is really an flv 'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html', 'md5': 'fa56683e291fc80635907168a743c9ad', 'info_dict': { 'id': 'e5da0d3edce5404418f5', 'display_id': 'jeune-couple-russe', 'ext': 'flv', 'title': 'Jeune Couple Russe', 'description': 'Amateur', 'thumbnail': 're:https?://.*\.jpg$', 'age_limit': 18, 'uploader': 'whiskeyjar', 'view_count': int, 'comment_count': int, 'average_rating': float, 'categories': ['Amateur', 'Teen'], } }] youtube-dl/youtube_dl/extractor/deezer.py0000644000000000000000000000620212641030331017654 0ustar rootrootfrom __future__ import unicode_literals import json import re from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, orderedSet, ) class DeezerPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?deezer\.com/playlist/(?P[0-9]+)' _TEST = { 'url': 'http://www.deezer.com/playlist/176747451', 'info_dict': { 'id': '176747451', 'title': 'Best!', 'uploader': 'Anonymous', 'thumbnail': 're:^https?://cdn-images.deezer.com/images/cover/.*\.jpg$', }, 'playlist_count': 30, 'skip': 'Only available in .de', } def _real_extract(self, url): if 'test' not in self._downloader.params: self._downloader.report_warning('For now, this extractor only supports the 30 second previews. Patches welcome!') mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') webpage = self._download_webpage(url, playlist_id) geoblocking_msg = self._html_search_regex( r'

<p class="soon-txt">(.*?)</p>

    ', webpage, 'geoblocking message', default=None) if geoblocking_msg is not None: raise ExtractorError( 'Deezer said: %s' % geoblocking_msg, expected=True) data_json = self._search_regex( r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n', webpage, 'data JSON') data = json.loads(data_json) playlist_title = data.get('DATA', {}).get('TITLE') playlist_uploader = data.get('DATA', {}).get('PARENT_USERNAME') playlist_thumbnail = self._search_regex( r']* id="challenge"', login_results) is not None: tfa_code = self._get_tfa_info('2-step verification code') if not tfa_code: self._downloader.report_warning( 'Two-factor authentication required. Provide it either interactively or with --twofactor ' '(Note that only TOTP (Google Authenticator App) codes work at this time.)') return False tfa_code = remove_start(tfa_code, 'G-') tfa_form_strs = self._form_hidden_inputs('challenge', login_results) tfa_form_strs.update({ 'Pin': tfa_code, 'TrustDevice': 'on', }) tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii') tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) tfa_results = self._download_webpage( tfa_req, None, note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) if tfa_results is False: return False if re.search(r'(?i)]* id="challenge"', tfa_results) is not None: self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') return False if re.search(r'(?i)]* id="gaia_loginform"', tfa_results) is not None: self._downloader.report_warning('unable to log in - did the page structure change?') return False if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: self._downloader.report_warning('Your Google account has a security notice. 
Please log in on your web browser, resolve the notice, and try again.') return False if re.search(r'(?i)]* id="gaia_loginform"', login_results) is not None: self._downloader.report_warning('unable to log in: bad username or password') return False return True def _real_initialize(self): if self._downloader is None: return self._set_language() if not self._login(): return class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Extract entries from page with "Load more" button def _entries(self, page, playlist_id): more_widget_html = content_html = page for page_num in itertools.count(1): for entry in self._process_page(content_html): yield entry mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) if not mobj: break more = self._download_json( 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num, transform_source=uppercase_escape) content_html = more['content_html'] if not content_html.strip(): # Some webpages show a "Load more" button but they don't # have more videos break more_widget_html = more['load_more_widget_html'] class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): for video_id, video_title in self.extract_videos_from_page(content): yield self.url_result(video_id, 'Youtube', video_id, video_title) def extract_videos_from_page(self, page): ids_in_page = [] titles_in_page = [] for mobj in re.finditer(self._VIDEO_RE, page): # The link with index 0 is not the first video of the playlist (not sure if still actual) if 'index' in mobj.groupdict() and mobj.group('id') == '0': continue video_id = mobj.group('id') video_title = unescapeHTML(mobj.group('title')) if video_title: video_title = video_title.strip() try: idx = ids_in_page.index(video_id) if video_title and not titles_in_page[idx]: titles_in_page[idx] = video_title except ValueError: ids_in_page.append(video_id) titles_in_page.append(video_title) return zip(ids_in_page, titles_in_page) class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) title = self._og_search_title(webpage, fatal=False) return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ ( (?:https?://|//) # http(s):// or protocol-independent URL (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! (?:.*?[&;])?? 
# any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) v= ) )) |(?: youtu\.be| # just youtu.be/xxxx vid\.plus # or vid.plus/xxxx )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?!.*?&list=) # combined list/video URLs are handled by the playlist IE (?(1).+)? # if we found the ID, everything can follow $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, # 3D videos '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, # Apple HTTP Live Streaming '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, '95': {'ext': 'mp4', 'height': 
720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, # DASH mp4 video '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Dash mp4 audio '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40}, '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 
'vcodec': 'vp9', 'preference': -40}, '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, # Dash webm audio '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, # Dash webm audio with opus inside '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50}, '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50}, '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50}, # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, } _SUBTITLE_FORMATS = ('ttml', 'vtt') IE_NAME = 'youtube' _TESTS = [ { 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'like_count': int, 'dislike_count': int, 'start_time': 1, 'end_time': 9, } }, { 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY', 'note': 'Test generic use_cipher_signature video (#897)', 'info_dict': { 'id': 'UxxajLWwzqY', 'ext': 'mp4', 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'alt_title': 'I Love It (feat. 
Charli XCX)', 'description': 'md5:782e8651347686cba06e58f71ab51773', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', 'creator': 'Icona Pop', } }, { 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', 'note': 'Test VEVO video with age protection (#956)', 'info_dict': { 'id': '07FYdnEawAQ', 'ext': 'mp4', 'upload_date': '20130703', 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', 'alt_title': 'Tunnel Vision', 'description': 'md5:64249768eec3bc4276236606ea996373', 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', 'creator': 'Justin Timberlake', 'age_limit': 18, } }, { 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', 'note': 'Embed-only video (#1746)', 'info_dict': { 'id': 'yZIXLfi8CZQ', 'ext': 'mp4', 'upload_date': '20120608', 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', 'uploader': 'SET India', 'uploader_id': 'setindia', 'age_limit': 18, } }, { 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'like_count': int, 'dislike_count': int, }, 'params': { 'skip_download': True, }, }, { 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I', 'note': '256k DASH audio (format 141) via DASH manifest', 'info_dict': { 'id': 'a9LDPn-MO4I', 'ext': 'm4a', 'upload_date': '20121002', 'uploader_id': '8KVIDEO', 'description': '', 'uploader': '8KVIDEO', 'title': 'UHDTV TEST 8K VIDEO.mp4' }, 'params': { 'youtube_include_dash_manifest': True, 'format': '141', }, }, # DASH manifest with encrypted signature { 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', 'info_dict': { 'id': 'IB3lcPjvWLA', 'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark ft. 
Spree Wilson', 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', }, 'params': { 'youtube_include_dash_manifest': True, 'format': '141', }, }, # JS player signature function name containing $ { 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', 'info_dict': { 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', 'alt_title': 'Shake It Off', 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', 'creator': 'Taylor Swift', }, 'params': { 'youtube_include_dash_manifest': True, 'format': '141', }, }, # Controversy video { 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', 'info_dict': { 'id': 'T4XJQO3qol8', 'ext': 'mp4', 'upload_date': '20100909', 'uploader': 'The Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } }, # Normal age-gate video (No vevo, embed allowed) { 'url': 'http://youtube.com/watch?v=HtVdAasjOgU', 'info_dict': { 'id': 'HtVdAasjOgU', 'ext': 'mp4', 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', 'upload_date': '20140605', 'age_limit': 18, }, }, # Age-gate video with encrypted signature { 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU', 'info_dict': { 'id': '6kLq3WMV1nU', 'ext': 'mp4', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'upload_date': '20110629', 'age_limit': 18, }, }, # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) { 'url': '__2ABJjxzNo', 'info_dict': { 'id': '__2ABJjxzNo', 'ext': 'mp4', 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', 'alt_title': 'Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', ] }, # Olympics (https://github.com/rg3/youtube-dl/issues/4431) { 'url': 'lqQg6PlCWgI', 'info_dict': { 'id': 'lqQg6PlCWgI', 'ext': 'mp4', 'upload_date': '20150827', 'uploader_id': 'olympic', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympics', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', }, 'params': { 'skip_download': 'requires avconv', } }, # Non-square pixels { 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', 'info_dict': { 'id': '_b-2C3KPAM0', 'ext': 'mp4', 'stretched_ratio': 16 / 9., 'upload_date': '20110310', 'uploader_id': 'AllenMeow', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'uploader': '孫艾倫', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', }, }, # url_encoded_fmt_stream_map is empty string { 'url': 'qEJwOuvDf7I', 'info_dict': { 'id': 'qEJwOuvDf7I', 'ext': 'webm', 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', 'description': '', 'upload_date': '20150404', 'uploader_id': 'spbelect', 'uploader': 'Наблюдатели Петербурга', }, 'params': { 'skip_download': 'requires avconv', 
}, 'skip': 'This live event has ended.', }, # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) { 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { 'id': 'FIl7x6_3R5Y', 'ext': 'mp4', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'upload_date': '20150625', 'uploader_id': 'dorappi2000', 'uploader': 'dorappi2000', 'formats': 'mincount:33', }, }, # DASH manifest with segment_list { 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', 'md5': '8ce563a1d667b599d21064e982ab9e31', 'info_dict': { 'id': 'CsmdDsKjzN8', 'ext': 'mp4', 'upload_date': '20150501', # According to id_m = re.match( r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) player_type = id_m.group('ext') player_id = id_m.group('id') # Read from filesystem cache func_id = '%s_%s_%s' % ( player_type, player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) download_note = ( 'Downloading player %s' % player_url if self._downloader.params.get('verbose') else 'Downloading %s player %s' % (player_type, player_id) ) if player_type == 'js': code = self._download_webpage( player_url, video_id, note=download_note, errnote='Download of %s failed' % player_url) res = self._parse_sig_js(code) elif player_type == 'swf': urlh = self._request_webpage( player_url, video_id, note=download_note, errnote='Download of %s failed' % player_url) code = urlh.read() res = self._parse_sig_swf(code) else: assert False, 'Invalid player type %r' % player_type test_string = ''.join(map(compat_chr, range(len(example_sig)))) cache_res = res(test_string) cache_spec = [ord(c) for c in cache_res] self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) return res def _print_sig_code(self, func, example_sig): def gen_sig_code(idxs): def _genslice(start, end, step): starts = '' if start == 0 else str(start) ends = (':%d' % (end + step)) if end + step >= 0 else ':' steps = '' if step == 1 else (':%d' % step) return 's[%s%s%s]' % (starts, ends, steps) step = None # Quell pyflakes warnings - start will be set when step is set start = '(Never used)' for i, prev in zip(idxs[1:], idxs[:-1]): if step is not None: if i - prev == step: continue yield _genslice(start, prev, step) step = None continue if i - prev in [-1, 1]: step = i - prev start = prev continue else: yield 's[%d]' % prev if step is None: yield 's[%d]' % i else: yield _genslice(start, i, step) test_string = ''.join(map(compat_chr, range(len(example_sig)))) cache_res = func(test_string) cache_spec = [ord(c) for c in cache_res] expr_code = ' + '.join(gen_sig_code(cache_spec)) signature_id_tuple = '(%s)' % ( ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' ' return %s\n') % (signature_id_tuple, expr_code) self.to_screen('Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): funcname = self._search_regex( r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode, 'Initial JS player signature function name') jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) def _parse_sig_swf(self, file_contents): swfi = SWFInterpreter(file_contents) TARGET_CLASSNAME = 'SignatureDecipher' 
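# Editor's note -- an illustrative sketch, not part of the original source:
# the signature cache used by _extract_signature_function() above stores only
# a list of character indices, and the cached decipher function is
#     lambda s: ''.join(s[i] for i in cache_spec)
# so, assuming a hypothetical cache_spec of [2, 0, 1], the input 'abc' maps to
# 'cab'. _print_sig_code() above pretty-prints such specs by collapsing index
# runs with a step of +/-1 into slices: [3, 4, 5, 6] renders as s[3:7] and
# [10, 9, 8, 7] as s[10:6:-1]; isolated indices stay as s[n] terms.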
searched_class = swfi.extract_class(TARGET_CLASSNAME) initial_function = swfi.extract_function(searched_class, 'decipher') return lambda s: initial_function([s]) def _decrypt_signature(self, s, video_id, player_url, age_gate=False): """Turn the encrypted s field into a working signature""" if player_url is None: raise ExtractorError('Cannot decrypt signature without player_url') if player_url.startswith('//'): player_url = 'https:' + player_url try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: func = self._extract_signature_function( video_id, player_url, s ) self._player_cache[player_id] = func func = self._player_cache[player_id] if self._downloader.params.get('youtube_print_sig_code'): self._print_sig_code(func, s) return func(s) except Exception as e: tb = traceback.format_exc() raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) def _get_subtitles(self, video_id, webpage): try: subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) return {} sub_lang_list = {} for track in subs_doc.findall('track'): lang = track.attrib['lang_code'] if lang in sub_lang_list: continue sub_formats = [] for ext in self._SUBTITLE_FORMATS: params = compat_urllib_parse.urlencode({ 'lang': lang, 'v': video_id, 'fmt': ext, 'name': track.attrib['name'].encode('utf-8'), }) sub_formats.append({ 'url': 'https://www.youtube.com/api/timedtext?' + params, 'ext': ext, }) sub_lang_list[lang] = sub_formats if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} return sub_lang_list def _get_ytplayer_config(self, video_id, webpage): patterns = ( # User data may contain arbitrary character sequences that may affect # JSON extraction with regex, e.g. when '};' is contained the second # regex won't capture the whole JSON. 
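# Editor's sketch (illustrative, not part of the original source), using a
# made-up config value to show the truncation:
#
#     import re
#     page = ';ytplayer.config = {"args": {"title": "a};b"}};ytplayer.load();'
#     re.search(r';ytplayer\.config\s*=\s*({.+?});', page).group(1)
#     # -> '{"args": {"title": "a}'   (cut off inside the quoted string)
#     re.search(r';ytplayer\.config\s*=\s*({.+?});ytplayer', page).group(1)
#     # -> '{"args": {"title": "a};b"}}'   (intact)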
As a workaround, we try the more # concrete regex first, keeping in mind that proper quoted-string handling, # to be implemented in the future, will replace this workaround (see # https://github.com/rg3/youtube-dl/issues/7468, # https://github.com/rg3/youtube-dl/pull/7599) r';ytplayer\.config\s*=\s*({.+?});ytplayer', r';ytplayer\.config\s*=\s*({.+?});', ) config = self._search_regex( patterns, webpage, 'ytplayer.config', default=None) if config: return self._parse_json( uppercase_escape(config), video_id, fatal=False) def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url; pass it as an argument to speed up the process.""" self.to_screen('%s: Looking for automatic captions' % video_id) player_config = self._get_ytplayer_config(video_id, webpage) err_msg = 'Couldn\'t find automatic captions for %s' % video_id if not player_config: self._downloader.report_warning(err_msg) return {} try: args = player_config['args'] caption_url = args['ttsurl'] if not caption_url: self._downloader.report_warning(err_msg) return {} timestamp = args['timestamp'] # We get the available subtitles list_params = compat_urllib_parse.urlencode({ 'type': 'list', 'tlangs': 1, 'asrs': 1, }) list_url = caption_url + '&' + list_params caption_list = self._download_xml(list_url, video_id) original_lang_node = caption_list.find('track') if original_lang_node is None: self._downloader.report_warning('Video doesn\'t have automatic captions') return {} original_lang = original_lang_node.attrib['lang_code'] caption_kind = original_lang_node.attrib.get('kind', '') sub_lang_list = {} for lang_node in caption_list.findall('target'): sub_lang = lang_node.attrib['lang_code'] sub_formats = [] for ext in self._SUBTITLE_FORMATS: params = compat_urllib_parse.urlencode({ 'lang': original_lang, 'tlang': sub_lang, 'fmt': ext, 'ts': timestamp, 'kind': caption_kind, }) sub_formats.append({ 'url': caption_url + '&' + params, 'ext': ext, }) sub_lang_list[sub_lang] = sub_formats return sub_lang_list # An extractor error can be raised by the download process if there are # no automatic captions but there are subtitles except (KeyError, ExtractorError): self._downloader.report_warning(err_msg) return {} @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) if mobj is None: raise ExtractorError('Invalid URL: %s' % url) video_id = mobj.group(2) return video_id def _extract_from_m3u8(self, manifest_url, video_id): url_map = {} def _get_urls(_manifest): lines = _manifest.split('\n') urls = filter(lambda l: l and not l.startswith('#'), lines) return urls manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest') formats_urls = _get_urls(manifest) for format_url in formats_urls: itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag') url_map[itag] = format_url return url_map def _extract_annotations(self, video_id): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) proto = ( 'http' if self._downloader.params.get('prefer_insecure', False) else 'https') start_time = None end_time = None parsed_url = compat_urllib_parse_urlparse(url) for component in [parsed_url.fragment, parsed_url.query]: query = compat_parse_qs(component) if start_time is None and 't' in query: start_time = 
parse_duration(query['t'][0]) if start_time is None and 'start' in query: start_time = parse_duration(query['start'][0]) if end_time is None and 'end' in query: end_time = parse_duration(query['end'][0]) # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') video_id = self.extract_id(url) # Get video webpage url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) if mobj is not None: player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) else: player_url = None dash_mpds = [] def add_dash_mpd(video_info): dash_mpd = video_info.get('dashmpd') if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) # Get video info embed_webpage = None is_live = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube url = proto + '://www.youtube.com/embed/%s' % video_id embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') data = compat_urllib_parse.urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_webpage = self._download_webpage( video_info_url, video_id, note='Refetching age-gated info webpage', errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) add_dash_mpd(video_info) else: age_gate = False video_info = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: args = ytplayer_config['args'] if args.get('url_encoded_fmt_stream_map'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH # manifest pointed by get_video_info's dashmpd). 
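# Editor's sketch (illustrative, not part of the original source): the union
# described next amounts to a keep-first merge keyed on format_id, e.g.
#
#     def union_formats(manifest_format_lists):
#         merged = {}
#         for formats in manifest_format_lists:   # one list per DASH manifest
#             for f in formats:
#                 merged.setdefault(f['format_id'], f)   # first manifest wins
#         return list(merged.values())
#
# which mirrors the "do not overwrite" loop over _extract_mpd_formats() used
# later in this method.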
# The general idea is to take a union of itags of both DASH manifests (for example # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) self.report_video_info_webpage_download(video_id) for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: video_info_url = ( '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' % (proto, video_id, el_type)) video_info_webpage = self._download_webpage( video_info_url, video_id, note=False, errnote='unable to download video info webpage') get_video_info = compat_parse_qs(video_info_webpage) if get_video_info.get('use_cipher_signature') != ['True']: add_dash_mpd(get_video_info) if not video_info: video_info = get_video_info if 'token' in get_video_info: # Different get_video_info requests may report different results, e.g. # some may report video unavailability, but some may serve it without # any complaint (see https://github.com/rg3/youtube-dl/issues/7362, # the original webpage as well as el=info and el=embedded get_video_info # requests report video unavailability due to geo restriction while # el=detailpage succeeds and returns valid data). This is probably # due to YouTube measures against IP ranges of hosting providers. # We work around this by preferring the first successful video_info # containing the token, if no such video_info has been found yet. if 'token' not in video_info: video_info = get_video_info break if 'token' not in video_info: if 'reason' in video_info: if 'The uploader has not made this video available in your country.' in video_info['reason']: regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) if regions_allowed: raise ExtractorError('YouTube said: This video is available in %s only' % ( ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), expected=True) raise ExtractorError( 'YouTube said: %s' % video_info['reason'][0], expected=True, video_id=video_id) else: raise ExtractorError( '"token" parameter not in video info for unknown reason', video_id=video_id) # title if 'title' in video_info: video_title = video_info['title'][0] else: self._downloader.report_warning('Unable to extract video title') video_title = '_' # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: video_description = re.sub(r'''(?x) <a\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> [^<]+\.{3}\s* </a> ''', r'\1', video_description) video_description = clean_html(video_description) else: fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) if fd_mobj: video_description = unescapeHTML(fd_mobj.group(1)) else: video_description = '' # uploader_id video_uploader_id = None mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage) if mobj is not None: video_uploader_id = mobj.group(1) else: self._downloader.report_warning('unable to extract uploader nickname') # thumbnail image # We first try to get a high-quality image: m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', video_webpage, re.DOTALL) if m_thumb is not None: video_thumbnail = m_thumb.group(1) elif 'thumbnail_url' not in video_info: self._downloader.report_warning('unable to extract video thumbnail') video_thumbnail = None else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) # upload date upload_date = self._html_search_meta( 'datePublished', video_webpage, 'upload date', default=None) if not upload_date: upload_date = self._search_regex( [r'(?s)id="eow-date.*?>(.*?)</span>', r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'], video_webpage, 'upload date', default=None) if upload_date: upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split()) upload_date = unified_strdate(upload_date) m_music = re.search( r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', video_webpage) if m_music: video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) video_creator = clean_html(m_music.group('creator')) else: video_alt_title = video_creator = None m_cat_container = self._search_regex( r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', video_webpage, 'categories', default=None) if m_cat_container: category = self._html_search_regex( r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', default=None) video_categories = None if category is None else [category] else: video_categories = None video_tags = [ unescapeHTML(m.group('content')) for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] def _extract_count(count_name): return str_to_int(self._search_regex( r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name), video_webpage, count_name, default=None)) like_count = _extract_count('like') dislike_count = _extract_count('dislike') # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) if 'length_seconds' not in video_info: self._downloader.report_warning('unable to extract video duration') video_duration = None else: video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0])) # annotations video_annotations = None if self._downloader.params.get('writeannotations', False): video_annotations = self._extract_annotations(video_id) def _map_to_format_list(urlmap): formats = [] for itag, video_real_url in urlmap.items(): dct = { 'format_id': itag, 'url': video_real_url, 'player_url': player_url, } if itag in self._formats: dct.update(self._formats[itag]) formats.append(dct) return formats if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() formats = [{ 'format_id': '_rtmp', 'protocol': 'rtmp', 'url': video_info['conn'][0], 'player_url': player_url, }] elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1: encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) if 'itag' not in url_data or 'url' not in url_data: continue format_id = url_data['itag'][0] url = url_data['url'][0] if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] elif 's' in url_data: encrypted_sig = url_data['s'][0] ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' jsplayer_url_json = self._search_regex( ASSETS_RE, embed_webpage if age_gate else video_webpage, 'JS player URL (1)', default=None) if not jsplayer_url_json and not age_gate: # We need the embed website after all if embed_webpage is None: embed_url = proto + '://www.youtube.com/embed/%s' % video_id embed_webpage = self._download_webpage( embed_url, video_id, 'Downloading embed webpage') jsplayer_url_json = self._search_regex( ASSETS_RE, embed_webpage, 'JS player URL') player_url = json.loads(jsplayer_url_json) if player_url is None: player_url_json = self._search_regex( r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', video_webpage, 'age gate player URL') player_url = json.loads(player_url_json) if 
self._downloader.params.get('verbose'): if player_url is None: player_version = 'unknown' player_desc = 'unknown' else: if player_url.endswith('swf'): player_version = self._search_regex( r'-(.+?)(?:/watch_as3)?\.swf$', player_url, 'flash player', fatal=False) player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version parts_sizes = self._signature_cache_id(encrypted_sig) self.to_screen('{%s} signature length %s, %s' % (format_id, parts_sizes, player_desc)) signature = self._decrypt_signature( encrypted_sig, video_id, player_url, age_gate) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' dct = { 'format_id': format_id, 'url': url, 'player_url': player_url, } if format_id in self._formats: dct.update(self._formats[format_id]) # Some itags are not included in DASH manifest thus corresponding formats will # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). # Trying to extract metadata from url_encoded_fmt_stream_map entry. mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) more_fields = { 'filesize': int_or_none(url_data.get('clen', [None])[0]), 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, 'height': height, 'fps': int_or_none(url_data.get('fps', [None])[0]), 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], } for key, value in more_fields.items(): if value: dct[key] = value type_ = url_data.get('type', [None])[0] if type_: type_split = type_.split(';') kind_ext = type_split[0].split('/') if len(kind_ext) == 2: kind, _ = kind_ext dct['ext'] = mimetype2ext(type_split[0]) if kind in ('audio', 'video'): codecs = None for mobj in re.finditer( r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_): if mobj.group('key') == 'codecs': codecs = mobj.group('val') break if codecs: codecs = codecs.split(',') if len(codecs) == 2: acodec, vcodec = codecs[1], codecs[0] else: acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0]) dct.update({ 'acodec': acodec, 'vcodec': vcodec, }) formats.append(dct) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) formats = _map_to_format_list(url_map) # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming for a_format in formats: a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' else: unavailable_message = self._html_search_regex( r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', video_webpage, 'unavailable message', default=None) if unavailable_message: raise ExtractorError(unavailable_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): dash_mpd_fatal = True for mpd_url in dash_mpds: dash_formats = {} try: def decrypt_sig(mobj): s = mobj.group(1) dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) return '/signature/%s' % dec_s mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) for df in self._extract_mpd_formats( 
mpd_url, video_id, fatal=dash_mpd_fatal, formats_dict=self._formats): # Do not overwrite DASH format found in some previous DASH manifest if df['format_id'] not in dash_formats: dash_formats[df['format_id']] = df # Additional DASH manifests may end up in HTTP Error 403 therefore # allow them to fail without bug report message if we already have # some DASH manifest succeeded. This is temporary workaround to reduce # burst of bug reports until we figure out the reason and whether it # can be fixed at all. dash_mpd_fatal = False except (ExtractorError, KeyError) as e: self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) if dash_formats: # Remove the formats we found through non-DASH, they # contain less info and it can be wrong, because we use # fixed values (for example the resolution). See # https://github.com/rg3/youtube-dl/issues/5774 for an # example. formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] formats.extend(dash_formats.values()) # Check for malformed aspect ratio stretched_m = re.search( r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', video_webpage) if stretched_m: w = float(stretched_m.group('w')) h = float(stretched_m.group('h')) # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). # We will only process correct ratios. if w > 0 and h > 0: ratio = w / h for f in formats: if f.get('vcodec') != 'none': f['stretched_ratio'] = ratio self._sort_formats(formats) return { 'id': video_id, 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'upload_date': upload_date, 'creator': video_creator, 'title': video_title, 'alt_title': video_alt_title, 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, 'tags': video_tags, 'subtitles': video_subtitles, 'automatic_captions': automatic_captions, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]), 'formats': formats, 'is_live': is_live, 'start_time': start_time, 'end_time': end_time, } class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? (?:\w+\.)? youtube\.com/ (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) \? (?:.*?[&;])*? (?:p|a|list)= | p/ ) ( (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', 'info_dict': { 'title': 'ytdl test PL', 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', }, 'playlist_count': 3, }, { 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', 'info_dict': { 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', 'title': 'YDL_Empty_List', }, 'playlist_count': 0, }, { 'note': 'Playlist with deleted videos (#651). 
As a bonus, video #51 also appears twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'info_dict': { 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', }, 'playlist_count': 95, }, { 'note': 'issue #673', 'url': 'PLBB231211A4F62143', 'info_dict': { 'title': '[OLD]Team Fortress 2 (Class-based LP)', 'id': 'PLBB231211A4F62143', }, 'playlist_mincount': 26, }, { 'note': 'Large playlist', 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', 'info_dict': { 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', }, 'playlist_mincount': 799, }, { 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', 'info_dict': { 'title': 'YDL_safe_search', 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', }, 'playlist_count': 2, }, { 'note': 'embedded', 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', 'playlist_count': 4, 'info_dict': { 'title': 'JODA15', 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', } }, { 'note': 'Embedded SWF player', 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', 'playlist_count': 4, 'info_dict': { 'title': 'JODA7', 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', } }, { 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', 'info_dict': { 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincount': 21, }] def _real_initialize(self): self._login() def _extract_mix(self, playlist_id): # Mixes are generated from a single video; # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage( url, playlist_id, 'Downloading Youtube mix') search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) title_span = ( search_title('playlist-title') or search_title('title long-title') or search_title('title')) title = clean_html(title_span) ids = orderedSet(re.findall( r'''(?xs)data-video-username=".*?".*? href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), webpage)) url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) def _extract_playlist(self, playlist_id): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page): match = match.strip() # Check if the playlist exists or is private if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match): raise ExtractorError( 'The playlist doesn\'t exist or is private, use --username or ' '--netrc to access it.', expected=True) elif re.match(r'[^<]*Invalid parameters[^<]*', match): raise ExtractorError( 'Invalid parameters. 
Maybe URL is incorrect.', expected=True) elif re.match(r'[^<]*Choose your language[^<]*', match): continue else: self.report_warning('Youtube gives an alert message: ' + match) playlist_title = self._html_search_regex( r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', page, 'title') return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) if 'v' in query_dict: video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) return self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError('Invalid URL: %s' % url) playlist_id = mobj.group(1) or mobj.group(2) video = self._check_download_just_video(url, playlist_id) if video: return video if playlist_id.startswith('RD') or playlist_id.startswith('UL'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) return self._extract_playlist(playlist_id) class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' _TESTS = [{ 'note': 'paginated channel', 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'playlist_mincount': 91, 'info_dict': { 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', 'title': 'Uploads from lex will', } }, { 'note': 'Age restricted channel', # from https://www.youtube.com/user/DeusExOfficial 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', 'playlist_mincount': 64, 'info_dict': { 'id': 'UUs0ifCMCm1icqRbqhUINa0w', 'title': 'Uploads from Deus Ex', }, }] @classmethod def suitable(cls, url): return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) def _real_extract(self, url): channel_id = self._match_id(url) url = self._TEMPLATE_URL % channel_id # Channel by page listing is restricted to 35 pages of 30 items, i.e. 
1050 videos total (see #5778) # As a workaround, extract as a playlist if we manage to obtain a channel playlist URL; # otherwise fall back on page-by-page channel extraction channel_page = self._download_webpage( url + '?view=57', channel_id, 'Downloading channel page', fatal=False) if channel_page is False: channel_playlist_id = False else: channel_playlist_id = self._html_search_meta( 'channelId', channel_page, 'channel id', default=None) if not channel_playlist_id: channel_playlist_id = self._search_regex( r'data-(?:channel-external-|yt)id="([^"]+)"', channel_page, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] return self.url_result( compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') autogenerated = re.search(r'''(?x) class="[^"]*?(?: channel-header-autogenerated-label| yt-channel-title-autogenerated )[^"]*"''', channel_page) is not None if autogenerated: # The videos are contained in a single page; # the ajax pages can't be used, they are empty entries = [ self.url_result( video_id, 'Youtube', video_id=video_id, video_title=video_title) for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) return self.playlist_result(self._entries(channel_page, channel_id), channel_id) class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos' IE_NAME = 'youtube:user' _TESTS = [{ 'url': 'https://www.youtube.com/user/TheLinuxFoundation', 'playlist_mincount': 320, 'info_dict': { 'title': 'TheLinuxFoundation', } }, { 'url': 'ytuser:phihag', 'only_matching': True, }] @classmethod def suitable(cls, url): # Don't return True if the url can be extracted by another youtube # extractor; the regex is too permissive and it would match otherwise. 
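# Editor's note (illustrative, not part of the original source): e.g. a
# playlist URL such as https://www.youtube.com/playlist?list=PL... would also
# happen to match the permissive _VALID_URL above ('playlist' parses as the
# user id), so without this check YoutubeUserIE could shadow
# YoutubePlaylistIE for such URLs.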
other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) if any(ie.suitable(url) for ie in other_ies): return False else: return super(YoutubeUserIE, cls).suitable(url) class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' IE_NAME = 'youtube:playlists' _TESTS = [{ 'url': 'http://www.youtube.com/user/ThirstForScience/playlists', 'playlist_mincount': 4, 'info_dict': { 'id': 'ThirstForScience', 'title': 'Thirst for Science', }, }, { # with "Load more" button 'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', 'playlist_mincount': 70, 'info_dict': { 'id': 'igorkle1', 'title': 'Игорь Клейнер', }, }, { 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', 'playlist_mincount': 17, 'info_dict': { 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', 'title': 'Chem Player', }, }] class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results _MAX_RESULTS = float('inf') IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' _EXTRA_QUERY_ARGS = {} _TESTS = [] def _get_n_results(self, query, n): """Get a specified number of results for a query""" videos = [] limit = n for pagenum in itertools.count(1): url_query = { 'search_query': query.encode('utf-8'), 'page': pagenum, 'spf': 'navigate', } url_query.update(self._EXTRA_QUERY_ARGS) result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query) data = self._download_json( result_url, video_id='query "%s"' % query, note='Downloading page %s' % pagenum, errnote='Unable to download API page') html_content = data[1]['body']['content'] if 'class="search-message' in html_content: raise ExtractorError( '[youtube] No video results', expected=True) new_videos = self._ids_to_results(orderedSet(re.findall( r'href="/watch\?v=(.{11})', html_content))) videos += new_videos if not new_videos or len(videos) > limit: break if len(videos) > n: videos = videos[:n] return self.playlist_result(videos, query) class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} class YoutubeSearchURLIE(InfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, 'info_dict': { 'title': 'youtube-dl test video', } }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) result_code = self._search_regex( r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') part_codes = re.findall( r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code) entries = [] for part_code in part_codes: part_title = self._html_search_regex( [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) part_url_snippet = 
self._html_search_regex( r'(?s)href="([^"]+)"', part_code, 'item URL') part_url = compat_urlparse.urljoin( 'https://www.youtube.com/', part_url_snippet) entries.append({ '_type': 'url', 'url': part_url, 'title': part_title, }) return { '_type': 'playlist', 'entries': entries, 'title': query, } class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com (multi-season) shows' _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' _TESTS = [{ 'url': 'https://www.youtube.com/show/airdisasters', 'playlist_mincount': 5, 'info_dict': { 'id': 'airdisasters', 'title': 'Air Disasters', } }] def _real_extract(self, url): playlist_id = self._match_id(url) return super(YoutubeShowIE, self)._real_extract( 'https://www.youtube.com/show/%s/playlists' % playlist_id) class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True @property def IE_NAME(self): return 'youtube:%s' % self._FEED_NAME def _real_initialize(self): self._login() def _real_extract(self, url): page = self._download_webpage( 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index ids = [] more_widget_html = content_html = page for page_num in itertools.count(1): matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) # 'recommended' feed has infinite 'load more' and each new portion spins # the same videos in (sometimes) slightly different order, so we'll check # for unicity and break when portion has no new videos new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) if not new_ids: break ids.extend(new_ids) mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not mobj: break more = self._download_json( 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape) content_html = more['content_html'] more_widget_html = more['load_more_widget_html'] return self.playlist_result( self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) class YoutubeWatchLaterIE(YoutubePlaylistIE): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=WL', 'only_matching': True, }, { 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', 'only_matching': True, }] def _real_extract(self, url): video = self._check_download_just_video(url, 'WL') if video: return video return self._extract_playlist('WL') class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:favorites' IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?' 
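# Editor's note (illustrative, not part of the original source): besides full
# URLs, the pattern above accepts shorthand pseudo-URLs, e.g.
#
#     import re
#     pat = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
#     assert re.match(pat, ':ytfav')
#     assert re.match(pat, ':ytfavourites')
#     assert re.match(pat, ':ytfavorites')   # ou? covers both spellings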
_LOGIN_REQUIRED = True def _real_extract(self, url): webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') return self.url_result(playlist_id, 'YoutubePlaylist') class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history' _PLAYLIST_TITLE = 'Youtube History' class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list _VALID_URL = r'''(?x) (?:https?://)? (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ (?:watch\?(?: feature=[a-z_]+| annotation_id=annotation_[^&]+| x-yt-cl=[0-9]+| hl=[^&]*| t=[0-9]+ )? | attribution_link\?a=[^&]+ ) $ ''' _TESTS = [{ 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041', 'only_matching': True, }, { 'url': 'http://www.youtube.com/watch?', 'only_matching': True, }, { 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', 'only_matching': True, }, { 'url': 'https://www.youtube.com/watch?feature=foo', 'only_matching': True, }, { 'url': 'https://www.youtube.com/watch?hl=en-GB', 'only_matching': True, }, { 'url': 'https://www.youtube.com/watch?t=2372', 'only_matching': True, }] def _real_extract(self, url): raise ExtractorError( 'Did you forget to quote the URL? Remember that & is a meta ' 'character in most shells, so you want to put the URL in quotes, ' 'like youtube-dl ' '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' ' or simply youtube-dl BaW_jenozKc .', expected=True) class YoutubeTruncatedIDIE(InfoExtractor): IE_NAME = 'youtube:truncated_id' IE_DESC = False # Do not list _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' _TESTS = [{ 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) raise ExtractorError( 'Incomplete YouTube ID %s. URL %s looks truncated.' 
% (video_id, url), expected=True)
youtube-dl/youtube_dl/extractor/brightcove.py0000644000000000000000000005445212641030331020544 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re import json from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_parse_qs, compat_str, compat_urllib_parse, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, ) from ..utils import ( determine_ext, ExtractorError, find_xpath_attr, fix_xml_ampersands, float_or_none, js_to_json, int_or_none, parse_iso8601, sanitized_Request, unescapeHTML, unsmuggle_url, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' _TESTS = [ { # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', 'md5': '5423e113865d26e40624dce2e4b45d95', 'note': 'Test Brightcove downloads and detection in GenericIE', 'info_dict': { 'id': '2371591881001', 'ext': 'mp4', 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', 'uploader': '8TV', 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', } }, { # From http://medianetwork.oracle.com/video/player/1785452137001 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', 'info_dict': { 'id': '1785452137001', 'ext': 'flv', 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', 'uploader': 'Oracle', }, }, { # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ 'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', 'info_dict': { 'id': '2750934548001', 'ext': 'mp4', 'title': 'This Bracelet Acts as a Personal Thermostat', 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', 'uploader': 'Mashable', }, }, { # test that the default referer works # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/ 'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', 'info_dict': { 'id': '2878862109001', 'ext': 
'mp4', 'title': 'Lost in Motion II', 'description': 'md5:363109c02998fee92ec02211bd8000df', 'uploader': 'National Ballet of Canada', }, }, { # test flv videos served by akamaihd.net # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', # The md5 checksum changes on each download 'info_dict': { 'id': '2996102916001', 'ext': 'flv', 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', 'uploader': 'Red Bull TV', 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', }, }, { # playlist test # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', 'info_dict': { 'title': 'Sealife', 'id': '3550319591001', }, 'playlist_mincount': 7, }, ] @classmethod def _build_brighcove_url(cls, object_str): """ Build a Brightcove URL from an XML string containing <object class="BrightcoveExperience">{params}</object> """ # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553 object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>', lambda m: m.group(1) + '/>', object_str) # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 object_str = object_str.replace('<--', '<!--') # remove namespace to simplify extraction object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str) object_str = fix_xml_ampersands(object_str) try: object_doc = compat_etree_fromstring(object_str.encode('utf-8')) except compat_xml_parse_error: return fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') if fv_el is not None: flashvars = dict( (k, v[0]) for k, v in compat_parse_qs(fv_el.attrib['value']).items()) else: flashvars = {} def find_param(name): if name in flashvars: return flashvars[name] node = find_xpath_attr(object_doc, './param', 'name', name) if node is not None: return node.attrib['value'] return None params = {} playerID = find_param('playerID') if playerID is None: raise ExtractorError('Cannot find player ID') params['playerID'] = playerID playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: params['playerKey'] = playerKey # The three fields hold the id of the video videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') if linkBase is not None: params['linkBaseURL'] = linkBase return cls._make_brightcove_url(params) @classmethod def _build_brighcove_url_from_js(cls, object_js): # The layout of JS is as follows: # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { # // build Brightcove <object /> XML # } m = re.search( r'''(?x)customBC\.createVideo\( .*? 
# skipping width and height ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters # in length, however it's appended to itself # in places, so truncate ["\'](?P<videoID>\d+)["\'] # @videoPlayer ''', object_js) if m: return cls._make_brightcove_url(m.groupdict()) @classmethod def _make_brightcove_url(cls, params): data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data @classmethod def _extract_brightcove_url(cls, webpage): """Try to extract the brightcove url from the webpage, returns None if it can't be found """ urls = cls._extract_brightcove_urls(webpage) return urls[0] if urls else None @classmethod def _extract_brightcove_urls(cls, webpage): """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]', webpage) if url_m: url = unescapeHTML(url_m.group(1)) # Some sites don't add it, we can't download with this url, for example: # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ if 'playerKey' in url or 'videoId' in url: return [url] matches = re.findall( r'''(?sx)<object (?: [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] | [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ ).+?>\s*</object>''', webpage) if matches: return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) return list(filter(None, [ cls._build_brighcove_url_from_js(custom_bc) for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) # Change the 'videoId' and others field to '@videoPlayer' url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url) # Change bckey (used by bcove.me urls) to playerKey url = re.sub(r'(?<=[?&])bckey', 'playerKey', url) mobj = re.match(self._VALID_URL, url) query_str = mobj.group('query') query = compat_urlparse.parse_qs(query_str) videoPlayer = query.get('@videoPlayer') if videoPlayer: # We set the original url as the default 'Referer' header referer = smuggled_data.get('Referer', url) return self._get_video_info( videoPlayer[0], query_str, query, referer=referer) elif 'playerKey' in query: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) else: raise ExtractorError( 'Cannot find playerKey= variable. 
Did you forget quotes in a shell invocation?', expected=True) def _get_video_info(self, video_id, query_str, query, referer=None): request_url = self._FEDERATED_URL_TEMPLATE % query_str req = sanitized_Request(request_url) linkBase = query.get('linkBaseURL') if linkBase is not None: referer = linkBase[0] if referer is not None: req.add_header('Referer', referer) webpage = self._download_webpage(req, video_id) error_msg = self._html_search_regex( r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage, 'error message', default=None) if error_msg is not None: raise ExtractorError( 'brightcove said: %s' % error_msg, expected=True) self.report_extraction(video_id) info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json') info = json.loads(info)['data'] video_info = info['programmedContent']['videoPlayer']['mediaDTO'] video_info['_youtubedl_adServerURL'] = info.get('adServerURL') return self._extract_video_info(video_info) def _get_playlist_info(self, player_key): info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key playlist_info = self._download_webpage( info_url, player_key, 'Downloading playlist information') json_data = json.loads(playlist_info) if 'videoList' not in json_data: raise ExtractorError('Empty playlist') playlist_info = json_data['videoList'] videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'], playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): info = { 'id': compat_str(video_info['id']), 'title': video_info['displayName'].strip(), 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), } renditions = video_info.get('renditions') if renditions: formats = [] for rend in renditions: url = rend['defaultURL'] if not url: continue ext = None if rend['remote']: url_comp = compat_urllib_parse_urlparse(url) if url_comp.path.endswith('.m3u8'): formats.extend( self._extract_m3u8_formats(url, info['id'], 'mp4')) continue elif 'akamaihd.net' in url_comp.netloc: # This type of renditions are served through # akamaihd.net, but they don't use f4m manifests url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB' ext = 'flv' if ext is None: ext = determine_ext(url) size = rend.get('size') formats.append({ 'url': url, 'ext': ext, 'height': rend.get('frameHeight'), 'width': rend.get('frameWidth'), 'filesize': size if size != 0 else None, }) self._sort_formats(formats) info['formats'] = formats elif video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], }) if self._downloader.params.get('include_ads', False): adServerURL = video_info.get('_youtubedl_adServerURL') if adServerURL: ad_info = { '_type': 'url', 'url': adServerURL, } if 'url' in info: return { '_type': 'playlist', 'title': info['title'], 'entries': [ad_info, info], } else: return ad_info if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info class BrightcoveNewIE(InfoExtractor): IE_NAME = 'brightcove:new' _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>(?:ref:)?\d+)' _TESTS = [{ 'url': 
'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', 'info_dict': { 'id': '4463358922001', 'ext': 'mp4', 'title': 'Meet the man behind Popcorn Time', 'description': 'md5:eac376a4fe366edc70279bfb681aea16', 'duration': 165.768, 'timestamp': 1441391203, 'upload_date': '20150904', 'uploader_id': '929656772001', 'formats': 'mincount:22', }, }, { # with rtmp streams 'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001', 'info_dict': { 'id': '4279049078001', 'ext': 'mp4', 'title': 'Titansgrave: Chapter 0', 'description': 'Titansgrave: Chapter 0', 'duration': 1242.058, 'timestamp': 1433556729, 'upload_date': '20150606', 'uploader_id': '4036320279001', 'formats': 'mincount:41', }, 'params': { 'skip_download': True, } }, { # ref: prefixed video id 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', 'only_matching': True, }] @staticmethod def _extract_url(webpage): urls = BrightcoveNewIE._extract_urls(webpage) return urls[0] if urls else None @staticmethod def _extract_urls(webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player entries = [] # Look for iframe embeds [1] for _, url in re.findall( r'<iframe[^>]+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): entries.append(url) # Look for embed_in_page embeds [2] for video_id, account_id, player_id, embed in re.findall( # According to examples from [3] it's unclear whether video id # may be optional and what to do when it is # According to [4] data-video-id may be prefixed with ref: r'''(?sx) <video[^>]+ data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? </video>.*? 
                <script[^>]+
                    src=["\'](?:https?:)?//players\.brightcove\.net/
                    (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js
            ''', webpage):
            entries.append(
                'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
                % (account_id, player_id, embed, video_id))

        return entries

    def _real_extract(self, url):
        account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()

        webpage = self._download_webpage(
            'http://players.brightcove.net/%s/%s_%s/index.min.js'
            % (account_id, player_id, embed), video_id)

        policy_key = None

        catalog = self._search_regex(
            r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
        if catalog:
            catalog = self._parse_json(
                js_to_json(catalog), video_id, fatal=False)
            if catalog:
                policy_key = catalog.get('policyKey')

        if not policy_key:
            policy_key = self._search_regex(
                r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
                webpage, 'policy key', group='pk')

        req = sanitized_Request(
            'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s'
            % (account_id, video_id),
            headers={'Accept': 'application/json;pk=%s' % policy_key})
        json_data = self._download_json(req, video_id)

        title = json_data['name']

        formats = []
        for source in json_data.get('sources', []):
            source_type = source.get('type')
            src = source.get('src')
            if source_type == 'application/x-mpegURL':
                if not src:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    src, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
            else:
                streaming_src = source.get('streaming_src')
                stream_name, app_name = source.get('stream_name'), source.get('app_name')
                if not src and not streaming_src and (not stream_name or not app_name):
                    continue
                tbr = float_or_none(source.get('avg_bitrate'), 1000)
                height = int_or_none(source.get('height'))
                f = {
                    'tbr': tbr,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'filesize': int_or_none(source.get('size')),
                    'container': source.get('container'),
                    'vcodec': source.get('codec'),
                    'ext': source.get('container').lower(),
                }

                def build_format_id(kind):
                    format_id = kind
                    if tbr:
                        format_id += '-%dk' % int(tbr)
                    if height:
                        format_id += '-%dp' % height
                    return format_id

                if src or streaming_src:
                    f.update({
                        'url': src or streaming_src,
                        'format_id': build_format_id('http' if src else 'http-streaming'),
                        'preference': 2 if src else 1,
                    })
                else:
                    f.update({
                        'url': app_name,
                        'play_path': stream_name,
                        'format_id': build_format_id('rtmp'),
                    })
                formats.append(f)
        self._sort_formats(formats)

        description = json_data.get('description')
        thumbnail = json_data.get('thumbnail')
        timestamp = parse_iso8601(json_data.get('published_at'))
        duration = float_or_none(json_data.get('duration'), 1000)
        tags = json_data.get('tags', [])

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'timestamp': timestamp,
            'uploader_id': account_id,
            'formats': formats,
            'tags': tags,
        }
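# A minimal standalone sketch of the Playback API request that
# BrightcoveNewIE._real_extract issues above, handy for checking a policy key
# outside youtube-dl. Python 3 only; the account_id/video_id/policy_key
# arguments are placeholders rather than values from this repository, and only
# the 'name' and 'sources' fields the extractor reads are returned.
import json
from urllib.request import Request, urlopen


def fetch_brightcove_metadata(account_id, video_id, policy_key):
    # Same endpoint and Accept header as the sanitized_Request built above
    url = ('https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s'
           % (account_id, video_id))
    req = Request(url, headers={'Accept': 'application/json;pk=%s' % policy_key})
    data = json.loads(urlopen(req).read().decode('utf-8'))
    return data['name'], data.get('sources', [])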
youtube-dl/youtube_dl/extractor/quickvid.py0000644000000000000000000000326712641030331020225 0ustar rootroot
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urlparse,
)
from ..utils import (
    determine_ext,
    int_or_none,
)


class QuickVidIE(InfoExtractor):
    _VALID_URL = r'https?://(www\.)?quickvid\.org/watch\.php\?v=(?P<id>[a-zA-Z_0-9-]+)'
    _TEST = {
        'url': 'http://quickvid.org/watch.php?v=sUQT3RCG8dx',
        'md5': 'c0c72dd473f260c06c808a05d19acdc5',
        'info_dict': {
            'id': 'sUQT3RCG8dx',
            'ext': 'mp4',
            'title': 'Nick Offerman\'s Summer Reading Recap',
            'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$',
            'view_count': int,
        },
        'skip': 'Not accessible from Travis CI server',
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h2>(.*?)</h2>', webpage, 'title')
        view_count = int_or_none(self._html_search_regex(
            r'(?s)<div id="views">(.*?)</div>',
            webpage, 'view count', fatal=False))
        video_code = self._search_regex(
            r'(?s)<video id="video"[^>]*>(.*?)</video>', webpage, 'video code')
        formats = [{
            'url': compat_urlparse.urljoin(url, src),
            'format_id': determine_ext(src, None),
        } for src in re.findall('<source\s+src="([^"]+)"', video_code)]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': self._og_search_thumbnail(webpage),
            'view_count': view_count,
        }
youtube-dl/youtube_dl/extractor/bleacherreport.py0000644000000000000000000001010312641030331021372 0ustar rootroot
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from .amp import AMPIE
from ..utils import (
    ExtractorError,
    int_or_none,
    parse_iso8601,
)


class BleacherReportIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football',
        'md5': 'a3ffc3dc73afdbc2010f02d98f990f20',
        'info_dict': {
            'id': '2496438',
            'ext': 'mp4',
            'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?',
            'uploader_id': 3992341,
            'description': 'CFB, ACC, Florida State',
            'timestamp': 1434380212,
            'upload_date': '20150615',
            'uploader': 'Team Stream Now ',
        },
        'add_ie': ['Ooyala'],
    }, {
        'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo',
        'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50',
        'info_dict': {
            'id': '2586817',
            'ext': 'mp4',
            'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961, 'uploader': 'Sean Fay', 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', 'uploader_id': 6466954, 'upload_date': '20151011', }, 'add_ie': ['Youtube'], }] def _real_extract(self, url): article_id = self._match_id(url) article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] thumbnails = [] primary_photo = article_data.get('primaryPhoto') if primary_photo: thumbnails = [{ 'url': primary_photo['url'], 'width': primary_photo.get('width'), 'height': primary_photo.get('height'), }] info = { '_type': 'url_transparent', 'id': article_id, 'title': article_data['title'], 'uploader': article_data.get('author', {}).get('name'), 'uploader_id': article_data.get('authorId'), 'timestamp': parse_iso8601(article_data.get('createdAt')), 'thumbnails': thumbnails, 'comment_count': int_or_none(article_data.get('commentsCount')), 'view_count': int_or_none(article_data.get('hitCount')), } video = article_data.get('video') if video: video_type = video['type'] if video_type == 'cms.bleacherreport.com': info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] elif video_type == 'ooyala.com': info['url'] = 'ooyala:%s' % video['id'] elif video_type == 'youtube.com': info['url'] = video['id'] elif video_type == 'vine.co': info['url'] = 'https://vine.co/v/%s' % video['id'] else: info['url'] = video_type + video['id'] return info else: raise ExtractorError('no video in the article', expected=True) class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', 'md5': '8c2c12e3af7805152675446c905d159b', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', 'ext': 'flv', 'title': 'Cena vs. 
Rollins Would Expose the Heavyweight Division',
            'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id)
        info['id'] = video_id
        return info
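# A small sketch of the video-type dispatch BleacherReportIE performs above:
# given the 'video' object from the article JSON, map its 'type' to the URL
# handed to the next extractor. The sample payload in the docstring is
# hypothetical; the branch logic mirrors _real_extract.
def bleacher_video_url(video):
    """video: e.g. {'type': 'ooyala.com', 'id': 'abc123'} (made-up sample)."""
    video_type = video['type']
    if video_type == 'cms.bleacherreport.com':
        return 'http://bleacherreport.com/video_embed?id=%s' % video['id']
    elif video_type == 'ooyala.com':
        return 'ooyala:%s' % video['id']
    elif video_type == 'youtube.com':
        return video['id']
    elif video_type == 'vine.co':
        return 'https://vine.co/v/%s' % video['id']
    return video_type + video['id']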
youtube-dl/youtube_dl/extractor/savefrom.py0000644000000000000000000000215312641030331020221 0ustar rootroot
# coding: utf-8
from __future__ import unicode_literals

import os.path
import re

from .common import InfoExtractor


class SaveFromIE(InfoExtractor):
    IE_NAME = 'savefrom.net'
    _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P<url>.*)$'

    _TEST = {
        'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com',
        'info_dict': {
            'id': 'UlVRAPW2WJY',
            'ext': 'mp4',
            'title': 'About Team Radical MMA | MMA Fighting',
            'upload_date': '20120816',
            'uploader': 'Howcast',
            'uploader_id': 'Howcast',
            'description': 're:(?s).* Hi, my name is Rene Dreifuss\. And I\'m here to show you some MMA.*',
        },
        'params': {
            'skip_download': True
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = os.path.splitext(url.split('/')[-1])[0]

        return {
            '_type': 'url',
            'id': video_id,
            'url': mobj.group('url'),
        }
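# SaveFromIE above only peels the real target URL out of the '#url=' fragment.
# An equivalent Python 3 sketch of that parsing step (the sample URL in the
# comment is illustrative, and this uses urldefrag instead of the extractor's
# _VALID_URL regex):
from urllib.parse import urldefrag


def savefrom_target(url):
    # e.g. 'http://en.savefrom.net/#url=http://youtube.com/watch?v=XXXX'
    _, fragment = urldefrag(url)
    if not fragment.startswith('url='):
        raise ValueError('no url= fragment in %s' % url)
    return fragment[len('url='):]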
youtube-dl/youtube_dl/extractor/giga.py0000644000000000000000000000735312641030331017315 0ustar rootroot
# coding: utf-8
from __future__ import unicode_literals

import itertools

from .common import InfoExtractor
from ..utils import (
    qualities,
    compat_str,
    parse_duration,
    parse_iso8601,
    str_to_int,
)


class GigaIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?giga\.de/(?:[^/]+/)*(?P<id>[^/]+)'
    _TESTS = [{
        'url': 'http://www.giga.de/filme/anime-awesome/trailer/anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss/',
        'md5': '6bc5535e945e724640664632055a584f',
        'info_dict': {
            'id': '2622086',
            'display_id': 'anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss',
            'ext': 'mp4',
            'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss',
            'description': 'md5:afdf5862241aded4718a30dff6a57baf',
            'thumbnail': 're:^https?://.*\.jpg$',
            'duration': 578,
            'timestamp': 1414749706,
            'upload_date': '20141031',
            'uploader': 'Robin Schweiger',
            'view_count': int,
        },
    }, {
        'url': 'http://www.giga.de/games/channel/giga-top-montag/giga-topmontag-die-besten-serien-2014/',
        'only_matching': True,
    }, {
        'url': 'http://www.giga.de/extra/netzkultur/videos/giga-games-tom-mats-robin-werden-eigene-wege-gehen-eine-ankuendigung/',
        'only_matching': True,
    }, {
        'url': 'http://www.giga.de/tv/jonas-liest-spieletitel-eingedeutscht-episode-2/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        video_id = self._search_regex(
            [r'data-video-id="(\d+)"', r'/api/video/jwplayer/#v=(\d+)'],
            webpage, 'video id')

        playlist = self._download_json(
            'http://www.giga.de/api/syndication/video/video_id/%s/playlist.json?content=syndication/key/368b5f151da4ae05ced7fa296bdff65a/' % video_id,
            video_id)[0]

        quality = qualities(['normal', 'hd720'])

        formats = []
        for format_id in itertools.count(0):
            fmt = playlist.get(compat_str(format_id))
            if not fmt:
                break
            formats.append({
                'url': fmt['src'],
                'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]),
                'quality': quality(fmt['quality']),
            })
        self._sort_formats(formats)

        title = self._html_search_meta(
            'title', webpage, 'title', fatal=True)
        description = self._html_search_meta(
            'description', webpage, 'description')
        thumbnail = self._og_search_thumbnail(webpage)

        duration = parse_duration(self._search_regex(
            r'(?s)(?:data-video-id="{0}"|data-video="[^"]*/api/video/jwplayer/#v={0}[^"]*")[^>]*>.+?<span class="duration">([^<]+)</span>'.format(video_id),
            webpage, 'duration', fatal=False))

        timestamp = parse_iso8601(self._search_regex(
            r'datetime="([^"]+)"', webpage, 'upload date', fatal=False))
        uploader = self._search_regex(
            r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False)

        view_count = str_to_int(self._search_regex(
            r'<span class="views"><strong>([\d.,]+)</strong>',
            webpage, 'view count', fatal=False))

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'timestamp': timestamp,
            'uploader': uploader,
            'view_count': view_count,
            'formats': formats,
        }
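# GigaIE walks the playlist JSON by probing consecutive stringified integer
# keys until one is missing. The same pattern in isolation, usable against a
# made-up payload such as {'0': {...}, '1': {...}}:
import itertools


def collect_numbered(playlist):
    # playlist keys are stringified indices; stop at the first gap
    items = []
    for i in itertools.count(0):
        fmt = playlist.get(str(i))
        if not fmt:
            break
        items.append(fmt)
    return items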
youtube-dl/youtube_dl/extractor/ntvru.py0000644000000000000000000001163612641030331017563 0ustar rootroot
# encoding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    clean_html,
    xpath_text,
    int_or_none,
)


class NTVRuIE(InfoExtractor):
    IE_NAME = 'ntv.ru'
    _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)'

    _TESTS = [
        {
            'url': 'http://www.ntv.ru/novosti/863142/',
            'md5': 'ba7ea172a91cb83eb734cad18c10e723',
            'info_dict': {
                'id': '746000',
                'ext': 'mp4',
                'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
                'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
                'thumbnail': 're:^http://.*\.jpg',
                'duration': 136,
            },
        },
        {
            'url': 'http://www.ntv.ru/video/novosti/750370/',
            'md5': 'adecff79691b4d71e25220a191477124',
            'info_dict': {
                'id': '750370',
                'ext': 'mp4',
                'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
                'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
                'thumbnail': 're:^http://.*\.jpg',
                'duration': 172,
            },
        },
        {
            'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
            'md5': '82dbd49b38e3af1d00df16acbeab260c',
            'info_dict': {
                'id': '747480',
                'ext': 'mp4',
                'title': '«Сегодня». 21 марта 2014 года. 16:00',
                'description': '«Сегодня». 21 марта 2014 года. 16:00',
                'thumbnail': 're:^http://.*\.jpg',
                'duration': 1496,
            },
        },
        {
            'url': 'http://www.ntv.ru/kino/Koma_film',
            'md5': 'f825770930937aa7e5aca0dc0d29319a',
            'info_dict': {
                'id': '1007609',
                'ext': 'mp4',
                'title': 'Остросюжетный фильм «Кома»',
                'description': 'Остросюжетный фильм «Кома»',
                'thumbnail': 're:^http://.*\.jpg',
                'duration': 5592,
            },
        },
        {
            'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
            'md5': '9320cd0e23f3ea59c330dc744e06ff3b',
            'info_dict': {
                'id': '751482',
                'ext': 'mp4',
                'title': '«Дело врачей»: «Деревце жизни»',
                'description': '«Дело врачей»: «Деревце жизни»',
                'thumbnail': 're:^http://.*\.jpg',
                'duration': 2590,
            },
        },
    ]

    _VIDEO_ID_REGEXES = [
        r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
        r'<video embed=[^>]+><id>(\d+)</id>',
        r'<video restriction[^>]+><key>(\d+)</key>',
    ]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        video_id = self._html_search_regex(self._VIDEO_ID_REGEXES, webpage, 'video id')

        player = self._download_xml(
            'http://www.ntv.ru/vi%s/' % video_id,
            video_id, 'Downloading video XML')
        title = clean_html(xpath_text(player, './data/title', 'title', fatal=True))
        description = clean_html(xpath_text(player, './data/description', 'description'))

        video = player.find('./data/video')
        video_id = xpath_text(video, './id', 'video id')
        thumbnail = xpath_text(video, './splash', 'thumbnail')
        duration = int_or_none(xpath_text(video, './totaltime', 'duration'))
        view_count = int_or_none(xpath_text(video, './views', 'view count'))

        token = self._download_webpage(
            'http://stat.ntv.ru/services/access/token',
            video_id, 'Downloading access token')

        formats = []
        for format_id in ['', 'hi', 'webm']:
            file_ = video.find('./%sfile' % format_id)
            if file_ is None:
                continue
            size = video.find('./%ssize' % format_id)
            formats.append({
                'url': 'http://media2.ntv.ru/vod/%s&tok=%s' % (file_.text, token),
                'filesize': int_or_none(size.text if size is not None else None),
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'formats': formats,
        }
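# NTVRuIE looks up the optional <file>/<hifile>/<webmfile> elements by
# prefixing the tag name. The same probing against a hand-written XML snippet
# (sample data, not an actual ntv.ru response):
import xml.etree.ElementTree as ET

video = ET.fromstring(
    '<video><file>a.mp4</file><hifile>a_hi.mp4</hifile></video>')
urls = []
for prefix in ['', 'hi', 'webm']:
    el = video.find('./%sfile' % prefix)
    if el is not None:
        urls.append(el.text)
# urls == ['a.mp4', 'a_hi.mp4'] -- the sample has no <webmfile>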
youtube-dl/youtube_dl/extractor/bigflix.py0000644000000000000000000000516412645665720020044 0ustar rootroot
# coding: utf-8
from __future__ import unicode_literals

import base64
import re

from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote


class BigflixIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537',
        'md5': 'ec76aa9b1129e2e5b301a474e54fab74',
        'info_dict': {
            'id': '16537',
            'ext': 'mp4',
            'title': 'Singham Returns',
            'description': 'md5:3d2ba5815f14911d5cc6a501ae0cf65d',
        }
    }, {
        # 2 formats
        'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070',
        'info_dict': {
            'id': '16070',
            'ext': 'mp4',
            'title': 'Madarasapatinam',
            'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca',
            'formats': 'mincount:2',
        },
        'params': {
            'skip_download': True,
        }
    }, {
        # multiple formats
        'url': 'http://www.bigflix.com/Malayalam-movies/Drama-movies/Indian-Rupee/15967',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<div[^>]+class=["\']pagetitle["\'][^>]*>(.+?)</div>',
            webpage, 'title')

        def decode_url(quoted_b64_url):
            return base64.b64decode(compat_urllib_parse_unquote(
                quoted_b64_url).encode('ascii')).decode('utf-8')

        formats = []
        for height, encoded_url in re.findall(
                r'ContentURL_(\d{3,4})[pP][^=]+=([^&]+)', webpage):
            video_url = decode_url(encoded_url)
            f = {
                'url': video_url,
                'format_id': '%sp' % height,
                'height': int(height),
            }
            if video_url.startswith('rtmp'):
                f['ext'] = 'flv'
            formats.append(f)

        file_url = self._search_regex(
            r'file=([^&]+)', webpage, 'video url', default=None)
        if file_url:
            video_url = decode_url(file_url)
            if all(f['url'] != video_url for f in formats):
                formats.append({
                    'url': decode_url(file_url),
                })

        self._sort_formats(formats)

        description = self._html_search_meta('description', webpage)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats
        }
youtube-dl/youtube_dl/extractor/dailymotion.py0000644000000000000000000004034612662061715020742 0ustar rootroot
# coding: utf-8
from __future__ import unicode_literals

import re
import json
import itertools

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    error_to_compat_str,
    ExtractorError,
    int_or_none,
    parse_iso8601,
    sanitized_Request,
    str_to_int,
    unescapeHTML,
)


class DailymotionBaseInfoExtractor(InfoExtractor):
    @staticmethod
    def _build_request(url):
        """Build a request with the family filter disabled"""
        request = sanitized_Request(url)
        request.add_header('Cookie', 'family_filter=off; ff=off')
        return request

    def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
        request = self._build_request(url)
        return self._download_webpage_handle(request, *args, **kwargs)

    def _download_webpage_no_ff(self, url, *args, **kwargs):
        request = self._build_request(url)
        return self._download_webpage(request, *args, **kwargs)


class DailymotionIE(DailymotionBaseInfoExtractor):
    _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)'
    IE_NAME = 'dailymotion'

    _FORMATS = [
        ('stream_h264_ld_url', 'ld'),
        ('stream_h264_url', 'standard'),
        ('stream_h264_hq_url', 'hq'),
        ('stream_h264_hd_url', 'hd'),
        ('stream_h264_hd1080_url', 'hd180'),
    ]

    _TESTS = [
        {
            'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
            'md5': '2137c41a8e78554bb09225b8eb322406',
            'info_dict': {
                'id': 'x2iuewm',
'ext': 'mp4', 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', 'description': 'Several come bundled with the Steam Controller.', 'thumbnail': 're:^https?:.*\.(?:jpg|png)$', 'duration': 74, 'timestamp': 1425657362, 'upload_date': '20150306', 'uploader': 'IGN', 'uploader_id': 'xijv66', 'age_limit': 0, 'view_count': int, 'comment_count': int, } }, # Vevo video { 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', 'info_dict': { 'title': 'Roar (Official)', 'id': 'USUV71301934', 'ext': 'mp4', 'uploader': 'Katy Perry', 'upload_date': '20130905', }, 'params': { 'skip_download': True, }, 'skip': 'VEVO is only available in some countries', }, # age-restricted video { 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', 'md5': '0d667a7b9cebecc3c89ee93099c4159d', 'info_dict': { 'id': 'xyh2zz', 'ext': 'mp4', 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', 'uploader': 'HotWaves1012', 'age_limit': 18, } }, # geo-restricted, player v5 { 'url': 'http://www.dailymotion.com/video/xhza0o', 'only_matching': True, }, # with subtitles { 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', 'only_matching': True, }, { 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', 'only_matching': True, } ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage_no_ff( 'https://www.dailymotion.com/video/%s' % video_id, video_id) age_limit = self._rta_search(webpage) description = self._og_search_description(webpage) or self._html_search_meta( 'description', webpage, 'description') view_count_str = self._search_regex( (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"', r'video_views_count[^>]+>\s+([\s\d\,.]+)'), webpage, 'view count', fatal=False) if view_count_str: view_count_str = re.sub(r'\s', '', view_count_str) view_count = str_to_int(view_count_str) comment_count = int_or_none(self._search_regex( r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"', webpage, 'comment count', fatal=False)) player_v5 = self._search_regex( [r'buildPlayer\(({.+?})\);\n', # See https://github.com/rg3/youtube-dl/issues/7826 r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', r'buildPlayer\(({.+?})\);'], webpage, 'player v5', default=None) if player_v5: player = self._parse_json(player_v5, video_id) metadata = player['metadata'] self._check_error(metadata) formats = [] for quality, media_list in metadata['qualities'].items(): for media in media_list: media_url = media.get('url') if not media_url: continue type_ = media.get('type') if type_ == 'application/vnd.lumberjack.manifest': continue ext = determine_ext(media_url) if type_ == 'application/x-mpegURL' or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', preference=-1, m3u8_id='hls', fatal=False)) elif type_ == 'application/f4m' or ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) else: f = { 'url': media_url, 'format_id': 'http-%s' % quality, } m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url) if m: f.update({ 'width': int(m.group('width')), 'height': int(m.group('height')), }) formats.append(f) self._sort_formats(formats) title = metadata['title'] duration = int_or_none(metadata.get('duration')) timestamp = int_or_none(metadata.get('created_time')) thumbnail = 
metadata.get('poster_url') uploader = metadata.get('owner', {}).get('screenname') uploader_id = metadata.get('owner', {}).get('id') subtitles = {} subtitles_data = metadata.get('subtitles', {}).get('data', {}) if subtitles_data and isinstance(subtitles_data, dict): for subtitle_lang, subtitle in subtitles_data.items(): subtitles[subtitle_lang] = [{ 'ext': determine_ext(subtitle_url), 'url': subtitle_url, } for subtitle_url in subtitle.get('urls', [])] return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, 'age_limit': age_limit, 'view_count': view_count, 'comment_count': comment_count, 'formats': formats, 'subtitles': subtitles, } # vevo embed vevo_id = self._search_regex( r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?video=(?P<id>[\w]*)', webpage, 'vevo embed', default=None) if vevo_id: return self.url_result('vevo:%s' % vevo_id, 'Vevo') # fallback old player embed_page = self._download_webpage_no_ff( 'https://www.dailymotion.com/embed/video/%s' % video_id, video_id, 'Downloading embed page') timestamp = parse_iso8601(self._html_search_meta( 'video:release_date', webpage, 'upload date')) info = self._parse_json( self._search_regex( r'var info = ({.*?}),$', embed_page, 'video info', flags=re.MULTILINE), video_id) self._check_error(info) formats = [] for (key, format_id) in self._FORMATS: video_url = info.get(key) if video_url is not None: m_size = re.search(r'H264-(\d+)x(\d+)', video_url) if m_size is not None: width, height = map(int_or_none, (m_size.group(1), m_size.group(2))) else: width, height = None, None formats.append({ 'url': video_url, 'ext': 'mp4', 'format_id': format_id, 'width': width, 'height': height, }) self._sort_formats(formats) # subtitles video_subtitles = self.extract_subtitles(video_id, webpage) title = self._og_search_title(webpage, default=None) if title is None: title = self._html_search_regex( r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage, 'title') return { 'id': video_id, 'formats': formats, 'uploader': info['owner.screenname'], 'timestamp': timestamp, 'title': title, 'description': description, 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, 'view_count': view_count, 'duration': info['duration'] } def _check_error(self, info): if info.get('error') is not None: raise ExtractorError( '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True) def _get_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, video_id, note=False) except ExtractorError as err: self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) return {} info = json.loads(sub_list) if (info['total'] > 0): sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list']) return sub_lang_list self._downloader.report_warning('video doesn\'t have subtitles') return {} class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/' _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' _TESTS = [{ 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', 'info_dict': { 'title': 'SPORT', 
'id': 'xv4bw_nqtv_sport', }, 'playlist_mincount': 20, }] def _extract_entries(self, id): video_ids = set() processed_urls = set() for pagenum in itertools.count(1): page_url = self._PAGE_TEMPLATE % (id, pagenum) webpage, urlh = self._download_webpage_handle_no_ff( page_url, id, 'Downloading page %s' % pagenum) if urlh.geturl() in processed_urls: self.report_warning('Stopped at duplicated page %s, which is the same as %s' % ( page_url, urlh.geturl()), id) break processed_urls.add(urlh.geturl()) for video_id in re.findall(r'data-xid="(.+?)"', webpage): if video_id not in video_ids: yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') video_ids.add(video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: break def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') webpage = self._download_webpage(url, playlist_id) return { '_type': 'playlist', 'id': playlist_id, 'title': self._og_search_title(webpage), 'entries': self._extract_entries(playlist_id), } class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', 'info_dict': { 'id': 'nqtv', 'title': 'Rémi Gaillard', }, 'playlist_mincount': 100, }, { 'url': 'http://www.dailymotion.com/user/UnderProject', 'info_dict': { 'id': 'UnderProject', 'title': 'UnderProject', }, 'playlist_mincount': 1800, 'expected_warnings': [ 'Stopped at duplicated page', ], 'skip': 'Takes too long time', }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') webpage = self._download_webpage( 'https://www.dailymotion.com/user/%s' % user, user) full_user = unescapeHTML(self._html_search_regex( r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user), webpage, 'user')) return { '_type': 'playlist', 'id': user, 'title': full_user, 'entries': self._extract_entries(user), } class DailymotionCloudIE(DailymotionBaseInfoExtractor): _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/' _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX _TESTS = [{ # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html # Tested at FranceTvInfo_2 'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1', 'only_matching': True, }, { # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html 'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1', 'only_matching': True, }] @classmethod def _extract_dmcloud_url(cls, webpage): mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL, webpage) if mobj: return mobj.group(1) mobj = re.search( r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % cls._VALID_EMBED_URL, webpage) if mobj: return mobj.group(1) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage_no_ff(url, video_id) title = self._html_search_regex(r'<title>([^>]+)', webpage, 'title') video_info = 
self._parse_json(self._search_regex( r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id) # TODO: parse ios_url, which is in fact a manifest video_url = video_info['mp4_url'] return { 'id': video_id, 'url': video_url, 'title': title, 'thumbnail': video_info.get('thumbnail_url'), } youtube-dl/youtube_dl/extractor/weibo.py0000644000000000000000000000337212641030331017510 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor class WeiboIE(InfoExtractor): """ The videos in Weibo come from different sites, this IE just finds the link to the external video and returns it. """ _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P.+?)\.htm' _TEST = { 'url': 'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm', 'info_dict': { 'id': '98322879', 'ext': 'flv', 'title': '魔声耳机最新广告“All Eyes On Us”', }, 'params': { 'skip_download': True, }, 'add_ie': ['Sina'], } # Additional example videos from different sites # Youku: http://video.weibo.com/v/weishipin/t_zQGDWQ8.htm # 56.com: http://video.weibo.com/v/weishipin/t_zQ44HxN.htm def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) video_id = mobj.group('id') info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id info = self._download_json(info_url, video_id) videos_urls = map(lambda v: v['play_page_url'], info['result']['data']) # Prefer sina video since they have thumbnails videos_urls = sorted(videos_urls, key=lambda u: 'video.sina.com' in u) player_url = videos_urls[-1] m_sina = re.match(r'https?://video\.sina\.com\.cn/v/b/(\d+)-\d+\.html', player_url) if m_sina is not None: self.to_screen('Sina video detected') sina_id = m_sina.group(1) player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id return self.url_result(player_url) youtube-dl/youtube_dl/extractor/comedycentral.py0000644000000000000000000002731112660177411021247 0ustar rootrootfrom __future__ import unicode_literals import re from .mtv import MTVServicesInfoExtractor from ..compat import ( compat_str, compat_urllib_parse, ) from ..utils import ( ExtractorError, float_or_none, unified_strdate, ) class ComedyCentralIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ (video-clips|episodes|cc-studios|video-collections|full-episodes|shows) /(?P.*)''' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', 'info_dict': { 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', 'ext': 'mp4', 'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', 'description': 'After a certain point, breastfeeding becomes c**kblocking.', }, }, { 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', 'only_matching': True, }] class ComedyCentralShowsIE(MTVServicesInfoExtractor): IE_DESC = 'The Daily Show / The Colbert Report' # urls can be abbreviations like :thedailyshow # urls for episodes like: # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow) |https?://(:www\.)? 
(?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/ ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)| (?P<clip> (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+)) |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)) )| (?P<interview> extended-interviews/(?P<interID>[0-9a-z]+)/ (?:playlist_tds_extended_)?(?P<interview_title>[^/?#]*?) (?:/[^/?#]?|[?#]|$)))) ''' _TESTS = [{ 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', 'info_dict': { 'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55', 'ext': 'mp4', 'upload_date': '20121213', 'description': 'Kristen Stewart learns to let loose in "On the Road."', 'uploader': 'thedailyshow', 'title': 'thedailyshow kristen-stewart part 1', } }, { 'url': 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview', 'info_dict': { 'id': 'sarah-chayes-extended-interview', 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', 'title': 'thedailyshow Sarah Chayes Extended Interview', }, 'playlist': [ { 'info_dict': { 'id': '0baad492-cbec-4ec1-9e50-ad91c291127f', 'ext': 'mp4', 'upload_date': '20150129', 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', 'uploader': 'thedailyshow', 'title': 'thedailyshow sarah-chayes-extended-interview part 1', }, }, { 'info_dict': { 'id': '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283', 'ext': 'mp4', 'upload_date': '20150129', 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', 'uploader': 'thedailyshow', 'title': 'thedailyshow sarah-chayes-extended-interview part 2', }, }, ], 'params': { 'skip_download': True, }, }, { 'url': 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', 'only_matching': True, }, { 'url': 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news', 'only_matching': True, }, { 'url': 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', 'only_matching': True, }, { 'url': 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', 'only_matching': True, }, { 'url': 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary', 'only_matching': True, }, { 'url': 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall', 'only_matching': True, }, { 'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights', 'only_matching': True, }, { 'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo', 'only_matching': True, }, { 'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food', 'only_matching': True, }, { 'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel', 'only_matching': True, }] _available_formats = ['3500', '2200', '1700', '1200', '750', '400'] _video_extensions = { '3500': 'mp4', '2200': 'mp4', '1700': 
'mp4', '1200': 'mp4', '750': 'mp4', '400': 'mp4', } _video_dimensions = { '3500': (1280, 720), '2200': (960, 540), '1700': (768, 432), '1200': (640, 360), '750': (512, 288), '400': (384, 216), } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj.group('shortname'): return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes') if mobj.group('clip'): if mobj.group('videotitle'): epTitle = mobj.group('videotitle') elif mobj.group('showname') == 'thedailyshow': epTitle = mobj.group('tdstitle') else: epTitle = mobj.group('cntitle') dlNewest = False elif mobj.group('interview'): epTitle = mobj.group('interview_title') dlNewest = False else: dlNewest = not mobj.group('episode') if dlNewest: epTitle = mobj.group('showname') else: epTitle = mobj.group('episode') show_name = mobj.group('showname') webpage, htmlHandle = self._download_webpage_handle(url, epTitle) if dlNewest: url = htmlHandle.geturl() mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: raise ExtractorError('Invalid redirected URL: ' + url) if mobj.group('episode') == '': raise ExtractorError('Redirected URL is still not specific: ' + url) epTitle = (mobj.group('episode') or mobj.group('videotitle')).rpartition('/')[-1] mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage) if len(mMovieParams) == 0: # The Colbert Report embeds the information in a without # a URL prefix; so extract the alternate reference # and then add the URL prefix manually. altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage) if len(altMovieParams) == 0: raise ExtractorError('unable to find Flash URL in webpage ' + url) else: mMovieParams = [('http://media.mtvnservices.com/' + altMovieParams[0], altMovieParams[0])] uri = mMovieParams[0][1] # Correct cc.com in uri uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri})) idoc = self._download_xml( index_url, epTitle, 'Downloading show index', 'Unable to download episode index') title = idoc.find('./channel/title').text description = idoc.find('./channel/description').text entries = [] item_els = idoc.findall('.//item') for part_num, itemEl in enumerate(item_els): upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text) thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url') content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') duration = float_or_none(content.attrib.get('duration')) mediagen_url = content.attrib['url'] guid = itemEl.find('./guid').text.rpartition(':')[-1] cdoc = self._download_xml( mediagen_url, epTitle, 'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els))) turls = [] for rendition in cdoc.findall('.//rendition'): finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) turls.append(finfo) formats = [] for format, rtmp_video_url in turls: w, h = self._video_dimensions.get(format, (None, None)) formats.append({ 'format_id': 'vhttp-%s' % format, 'url': self._transform_rtmp_url(rtmp_video_url), 'ext': self._video_extensions.get(format, 'mp4'), 'height': h, 'width': w, }) formats.append({ 'format_id': 'rtmp-%s' % format, 'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'), 'ext': self._video_extensions.get(format, 'mp4'), 'height': h, 'width': w, }) self._sort_formats(formats) subtitles 
= self._extract_subtitles(cdoc, guid)

            virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1)
            entries.append({
                'id': guid,
                'title': virtual_id,
                'formats': formats,
                'uploader': show_name,
                'upload_date': upload_date,
                'duration': duration,
                'thumbnail': thumbnail,
                'description': description,
                'subtitles': subtitles,
            })

        return {
            '_type': 'playlist',
            'id': epTitle,
            'entries': entries,
            'title': show_name + ' ' + title,
            'description': description,
        }
youtube-dl/youtube_dl/extractor/myvi.py0000644000000000000000000000430512641030331017364 0ustar rootroot
# coding: utf-8
from __future__ import unicode_literals

import re

from .vimple import SprutoBaseIE


class MyviIE(SprutoBaseIE):
    _VALID_URL = r'''(?x)
            https?://
                myvi\.(?:ru/player|tv)/
                    (?:
                        (?:
                            embed/html|
                            flash|
                            api/Video/Get
                        )/|
                        content/preloader\.swf\?.*\bid=
                    )
                    (?P<id>[\da-zA-Z_-]+)
            '''
    _TESTS = [{
        'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
        'md5': '571bbdfba9f9ed229dc6d34cc0f335bf',
        'info_dict': {
            'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43',
            'ext': 'mp4',
            'title': 'хозяин жизни',
            'thumbnail': 're:^https?://.*\.jpg$',
            'duration': 25,
        },
    }, {
        'url': 'http://myvi.ru/player/content/preloader.swf?id=oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wOYf1WFpPfc_bWTKGVf_Zafr0',
        'only_matching': True,
    }, {
        'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
        'only_matching': True,
    }, {
        'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0',
        'only_matching': True,
    }, {
        'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30',
        'only_matching': True,
    }]

    @classmethod
    def _extract_url(cls, webpage):
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage)
        if mobj:
            return mobj.group('url')

    def _real_extract(self, url):
        video_id = self._match_id(url)

        spruto = self._download_json(
            'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData']

        return self._extract_spruto(spruto, video_id)
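# Extractors such as MyviIE expose _extract_url so that GenericIE can spot
# embedded players in arbitrary pages. A self-contained sketch of that iframe
# scan, run against a made-up page snippet (the embed id is invented):
import re

sample_page = '<iframe src="//myvi.ru/player/embed/html/abc_DEF-123"></iframe>'
m = re.search(
    r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1',
    sample_page)
if m:
    print(m.group('url'))  # //myvi.ru/player/embed/html/abc_DEF-123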
youtube-dl/youtube_dl/extractor/howcast.py
from __future__ import unicode_literals from .common import InfoExtractor from ..utils import parse_iso8601 class HowcastIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)' _TEST = { 'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly', 'md5': '8b743df908c42f60cf6496586c7f12c3', 'info_dict': { 'id': '390161', 'ext': 'mp4', 'title': 'How to Tie a Square Knot Properly', 'description': 'md5:dbe792e5f6f1489027027bf2eba188a3', 'timestamp': 1276081287, 'upload_date': '20100609', 'duration': 56.823, }, 'params': { # m3u8 download 'skip_download': True, }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) embed_code = self._search_regex( r'<iframe[^>]+src="[^"]+\bembed_code=([^\b]+)\b', webpage, 'ooyala embed code') return { '_type': 'url_transparent', 'ie_key': 'Ooyala', 'url': 'ooyala:%s' % embed_code, 'id': video_id, 'timestamp': parse_iso8601(self._html_search_meta( 'article:published_time', webpage, 'timestamp')), }
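HowcastIE never resolves media URLs itself: the `url_transparent` result above defers format extraction to the Ooyala extractor, while the fields gathered here (id, timestamp) override the delegate's. A minimal hedged illustration of that result shape (the embed code is made up):

    result = {
        '_type': 'url_transparent',         # defer extraction to another IE
        'ie_key': 'Ooyala',                 # which extractor handles 'url'
        'url': 'ooyala:EmbedCodeGoesHere',  # hypothetical Ooyala embed code
        'id': '390161',                     # overrides the delegate's value
        'timestamp': 1276081287,
    }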
youtube-dl/youtube_dl/extractor/internetvideoarchive.py
from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_urlparse, compat_urllib_parse, ) from ..utils import ( xpath_with_ns, ) class InternetVideoArchiveIE(InfoExtractor): _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' _TEST = { 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', 'info_dict': { 'id': '452693', 'ext': 'mp4', 'title': 'SKYFALL', 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', 'duration': 152, }, } @staticmethod def _build_url(query): return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query @staticmethod def _clean_query(query): NEEDED_ARGS = ['publishedid', 'customerid'] query_dic = compat_urlparse.parse_qs(query) cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS) # Other player ids return m3u8 urls cleaned_dic['playerid'] = '247' cleaned_dic['videokbrate'] = '100000' return compat_urllib_parse.urlencode(cleaned_dic) def _real_extract(self, url): query = compat_urlparse.urlparse(url).query query_dic = compat_urlparse.parse_qs(query) video_id = query_dic['publishedid'][0] url = self._build_url(query) flashconfiguration = self._download_xml(url, video_id, 'Downloading flash configuration') file_url = flashconfiguration.find('file').text file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') # Replace some of the parameters in the query to get the best quality # and http links (no m3u8 manifests) file_url = re.sub(r'(?<=\?)(.+)$', lambda m: self._clean_query(m.group()), file_url) info = self._download_xml(file_url, video_id, 'Downloading video info') item = info.find('channel/item') def _bp(p): return xpath_with_ns( p, { 'media': 'http://search.yahoo.com/mrss/', 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats', } ) formats = [] for content in item.findall(_bp('media:group/media:content')): attr = content.attrib f_url = attr['url'] width = int(attr['width']) bitrate = int(attr['bitrate']) format_id = '%d-%dk' % (width, bitrate) formats.append({ 'format_id': format_id, 'url': f_url, 'width': width, 'tbr': bitrate, }) self._sort_formats(formats) return { 'id': video_id, 'title': item.find('title').text, 'formats': formats, 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'], 'description': item.find('description').text, 'duration': int(attr['duration']), }
youtube-dl/youtube_dl/extractor/facebook.py
from __future__ import unicode_literals import json import re import socket from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_http_client, compat_urllib_error, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, ) from ..utils import ( error_to_compat_str, ExtractorError, limit_length, sanitized_Request, urlencode_postdata, get_element_by_id, clean_html, ) class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?:// (?:\w+\.)?facebook\.com/ (?:[^#]*?\#!/)? (?: (?: video/video\.php| photo\.php| video\.php| video/embed )\?(?:.*?)(?:v|video_id)=| [^/]+/videos/(?:[^/]+/)?
)| facebook: ) (?P<id>[0-9]+) ''' _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' IE_NAME = 'facebook' _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'md5': '6a40d33c0eccbb1af76cf0485a052659', 'info_dict': { 'id': '637842556329505', 'ext': 'mp4', 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', 'uploader': 'Tennis on Facebook', } }, { 'note': 'Video without discernible title', 'url': 'https://www.facebook.com/video.php?v=274175099429670', 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', 'title': 'Facebook video #274175099429670', 'uploader': 'Asif Nawab Butt', }, 'expected_warnings': [ 'title' ] }, { 'note': 'Video with DASH manifest', 'url': 'https://www.facebook.com/video.php?v=957955867617029', 'md5': '54706e4db4f5ad58fbad82dde1f1213f', 'info_dict': { 'id': '957955867617029', 'ext': 'mp4', 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', 'uploader': 'Demy de Zeeuw', }, }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, }, { 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', 'only_matching': True, }, { 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', 'only_matching': True, }, { 'url': 'facebook:544765982287235', 'only_matching': True, }] def _login(self): (useremail, password) = self._get_login_info() if useremail is None: return login_page_req = sanitized_Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, note='Downloading login page', errnote='Unable to download login page') lsd = self._search_regex( r'<input type="hidden" name="lsd" value="([^"]*)"', login_page, 'lsd') lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd') login_form = { 'email': useremail, 'pass': password, 'lsd': lsd, 'lgnrnd': lgnrnd, 'next': 'http://facebook.com/home.php', 'default_persistent': '0', 'legacy_return': '1', 'timezone': '-60', 'trynum': '1', } request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') try: login_results = self._download_webpage(request, None, note='Logging in', errnote='unable to fetch login page') if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: error = self._html_search_regex( r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>', login_results, 'login error', default=None, group='error') if error: raise ExtractorError('Unable to login: %s' % error, expected=True) self._downloader.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). 
Check credentials or wait.') return fb_dtsg = self._search_regex( r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None) h = self._search_regex( r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None) if not fb_dtsg or not h: return check_form = { 'fb_dtsg': fb_dtsg, 'h': h, 'name_action_selected': 'dont_save', } check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') check_response = self._download_webpage(check_req, None, note='Confirming login') if re.search(r'id="checkpointSubmitButton"', check_response) is not None: self._downloader.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err)) return def _real_initialize(self): self._login() def _real_extract(self, url): video_id = self._match_id(url) req = sanitized_Request('https://www.facebook.com/video/video.php?v=%s' % video_id) req.add_header('User-Agent', self._CHROME_USER_AGENT) webpage = self._download_webpage(req, video_id) video_data = None BEFORE = '{swf.addParam(param[0], param[1]);});\n' AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) if m: data = dict(json.loads(m.group(1))) params_raw = compat_urllib_parse_unquote(data['params']) video_data = json.loads(params_raw)['video_data'] def video_data_list2dict(video_data): ret = {} for item in video_data: format_id = item['stream_type'] ret.setdefault(format_id, []).append(item) return ret if not video_data: server_js_data = self._parse_json(self._search_regex( r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id) for item in server_js_data.get('instances', []): if item[1][0] == 'VideoConfig': video_data = video_data_list2dict(item[2][0]['videoData']) break if not video_data: m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) if m_msg is not None: raise ExtractorError( 'The video is not available, Facebook said: "%s"' % m_msg.group(1), expected=True) else: raise ExtractorError('Cannot parse data') formats = [] for format_id, f in video_data.items(): if not f or not isinstance(f, list): continue for quality in ('sd', 'hd'): for src_type in ('src', 'src_no_ratelimit'): src = f[0].get('%s_%s' % (quality, src_type)) if src: formats.append({ 'format_id': '%s_%s_%s' % (format_id, quality, src_type), 'url': src, 'preference': -10 if format_id == 'progressive' else 0, }) dash_manifest = f[0].get('dash_manifest') if dash_manifest: formats.extend(self._parse_mpd_formats( compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) if not formats: raise ExtractorError('Cannot find video formats') self._sort_formats(formats) video_title = self._html_search_regex( r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title', default=None) if not video_title: video_title = self._html_search_regex( r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', webpage, 'alternative title', default=None) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) return { 'id': 
video_id, 'title': video_title, 'formats': formats, 'uploader': uploader, } class FacebookPostIE(InfoExtractor): IE_NAME = 'facebook:post' _VALID_URL = r'https?://(?:\w+\.)?facebook\.com/[^/]+/posts/(?P<id>\d+)' _TEST = { 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', 'info_dict': { 'id': '544765982287235', 'ext': 'mp4', 'title': '"What are you doing running in the snow?"', 'uploader': 'FailArmy', } } def _real_extract(self, url): post_id = self._match_id(url) webpage = self._download_webpage(url, post_id) entries = [ self.url_result('facebook:%s' % video_id, FacebookIE.ie_key()) for video_id in self._parse_json( self._search_regex( r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids'), post_id)] return self.playlist_result(entries, post_id)
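FacebookPostIE does no media extraction of its own: each id found in the post's `video_ids` array is wrapped in the internal `facebook:<id>` scheme, which the `facebook:` branch of FacebookIE._VALID_URL above accepts, so every playlist entry is resolved later by the main extractor. A hedged check of that round trip, reusing the id from the test above:

    import re
    entry_url = 'facebook:%s' % '544765982287235'
    assert re.match(FacebookIE._VALID_URL, entry_url)  # matches the 'facebook:' branch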
youtube-dl/youtube_dl/extractor/fktv.py
from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( clean_html, determine_ext, js_to_json, ) class FKTVIE(InfoExtractor): IE_NAME = 'fernsehkritik.tv' _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?' _TEST = { 'url': 'http://fernsehkritik.tv/folge-1', 'md5': '21f0b0c99bce7d5b524eb1b17b1c6d79', 'info_dict': { 'id': '1', 'ext': 'mp4', 'title': 'Folge 1 vom 10. April 2007', 'thumbnail': 're:^https?://.*\.jpg$', }, } def _real_extract(self, url): episode = self._match_id(url) webpage = self._download_webpage( 'http://fernsehkritik.tv/folge-%s/play' % episode, episode) title = clean_html(self._html_search_regex( '<h3>([^<]+)</h3>', webpage, 'title')) thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) formats = [] for source in sources: furl = source.get('src') if furl: formats.append({ 'url': furl, 'format_id': determine_ext(furl), }) self._sort_formats(formats) return { 'id': episode, 'title': title, 'formats': formats, 'thumbnail': thumbnail, }
youtube-dl/youtube_dl/extractor/wayofthemaster.py
from __future__ import unicode_literals import re from .common import InfoExtractor class WayOfTheMasterIE(InfoExtractor): _VALID_URL = r'https?://www\.wayofthemaster\.com/([^/?#]*/)*(?P<id>[^/?#]+)\.s?html(?:$|[?#])' _TEST = { 'url': 'http://www.wayofthemaster.com/hbks.shtml', 'md5': '5316b57487ada8480606a93cb3d18d24', 'info_dict': { 'id': 'hbks', 'ext': 'mp4', 'title': 'Intelligent Design vs. Evolution', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) title = self._search_regex( r'<img src="images/title_[^"]+".*?alt="([^"]+)"', webpage, 'title', default=None) if title is None: title = self._html_search_regex( r'<title>(.*?)</title>', webpage, 'page title') url_base = self._search_regex( r'
youtube-dl/youtube_dl/extractor/nextmovie.py
# coding: utf-8 from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor from ..compat import compat_urllib_parse class NextMovieIE(MTVServicesInfoExtractor): IE_NAME = 'nextmovie.com' _VALID_URL = r'https?://(?:www\.)?nextmovie\.com/shows/[^/]+/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' _FEED_URL = 'http://lite.dextr.mtvi.com/service1/dispatch.htm' _TESTS = [{ 'url': 'http://www.nextmovie.com/shows/exclusives/2013-03-10/mgid:uma:videolist:nextmovie.com:1715019/', 'md5': '09a9199f2f11f10107d04fcb153218aa', 'info_dict': { 'id': '961726', 'ext': 'mp4', 'title': 'The Muppets\' Gravity', }, }] def _get_feed_query(self, uri): return compat_urllib_parse.urlencode({ 'feed': '1505', 'mgid': uri, }) def _real_extract(self, url): mgid = self._match_id(url) return self._get_videos_info(mgid)
youtube-dl/youtube_dl/extractor/fczenit.py
# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor class FczenitIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)' _TEST = { 'url': 'http://fc-zenit.ru/video/gl6785/', 'md5': '458bacc24549173fe5a5aa29174a5606', 'info_dict': { 'id': '6785', 'ext': 'mp4', 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_title = self._html_search_regex(r'<div class="photoalbum__title">([^<]+)', webpage, 'title') bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) formats = [{ 'url': furl, 'tbr': tbr, } for furl, tbr in bitrates] self._sort_formats(formats) return { 'id': video_id, 'title': video_title, 'formats': formats, }
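A hedged worked example of the two fczenit regexes above, fed a made-up `bitrates_raw` fragment in the shape the player config is assumed to have; note that `re.findall` yields strings here, so 'tbr' ends up as a string:

    import re
    bitrates_raw = "url: '/video/gl6785/400.mp4', bitrate: 400}, {url: '/video/gl6785/900.mp4', bitrate: 900}"
    print(re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw))
    # [('/video/gl6785/400.mp4', '400'), ('/video/gl6785/900.mp4', '900')]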
youtube-dl/youtube_dl/extractor/ubu.py
from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( int_or_none, qualities, ) class UbuIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' _TEST = { 'url': 'http://ubu.com/film/her_noise.html', 'md5': '138d5652618bf0f03878978db9bef1ee', 'info_dict': { 'id': 'her_noise', 'ext': 'm4v', 'title': 'Her Noise - The Making Of (2007)', 'duration': 3600, }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_regex( r'<title>.+?Film &amp; Video: ([^<]+)</title>', webpage, 'title') duration = int_or_none(self._html_search_regex( r'Duration: (\d+) minutes', webpage, 'duration', fatal=False), invscale=60) formats = [] FORMAT_REGEXES = [ ('sq', r"'flashvars'\s*,\s*'file=([^']+)'"), ('hq', r'href="(http://ubumexico\.centro\.org\.mx/video/[^"]+)"'), ] preference = qualities([fid for fid, _ in FORMAT_REGEXES]) for format_id, format_regex in FORMAT_REGEXES: m = re.search(format_regex, webpage) if m: formats.append({ 'url': m.group(1), 'format_id': format_id, 'preference': preference(format_id), }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'duration': duration, 'formats': formats, }
youtube-dl/youtube_dl/extractor/nova.py
# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( clean_html, unified_strdate, )
class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', 'info_dict': { 'id': '1608920', 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou', 'ext': 'flv', 'title': 'Duel: Michal Hrdlička a Petr Suchoň', 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5', 'thumbnail': 're:^https?://.*\.(?:jpg)', }, 'params': { # rtmp download 'skip_download': True, } }, { 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', 'info_dict': { 'id': '1757139', 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', 'ext': 'mp4', 'title': 'Podzemní nemocnice v pražské Krči', 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', 'thumbnail': 're:^https?://.*\.(?:jpg)', } }, { 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove', 'info_dict': { 'id': '1756825', 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', 'ext': 'flv', 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags 'thumbnail': 're:^https?://.*\.(?:jpg)', }, 'params': { # rtmp download 'skip_download': True, } }, { 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/', 'info_dict': { 'id': '1756858', 'ext': 'flv', 'title': 'Televizní noviny - 30. 5. 2015', 'thumbnail': 're:^https?://.*\.(?:jpg)', 'upload_date': '20150530', }, 'params': { # rtmp download 'skip_download': True, } }, { 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', 'info_dict': { 'id': '1753621', 'ext': 'mp4', 'title': 'Zaklínač 3: Divoký hon', 'description': 're:.*Pokud se stejně jako my nemůžete.*', 'thumbnail': 're:https?://.*\.jpg(\?.*)?', 'upload_date': '20150521', }, 'params': { # rtmp download 'skip_download': True, } }, { 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', 'only_matching': True, }, { 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', 'only_matching': True, }, { 'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html', 'only_matching': True, }, { 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html', 'only_matching': True, }, { 'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') site = mobj.group('site') webpage = self._download_webpage(url, display_id) video_id = self._search_regex( [r"(?:media|video_id)\s*:\s*'(\d+)'", r'media=(\d+)', r'id="article_video_(\d+)"', r'id="player_(\d+)"'], webpage, 'video id') config_url = self._search_regex( r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', webpage, 'config url', default=None) if not config_url: DEFAULT_SITE_ID = '23000' SITES = { 'tvnoviny': DEFAULT_SITE_ID, 'novaplus': DEFAULT_SITE_ID, 'vymena': DEFAULT_SITE_ID, 'krasna': DEFAULT_SITE_ID, 'fanda': '30', 'tn': '30', 'doma': '30', } site_id = self._search_regex( r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID) config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig' % (site_id, video_id)) config = self._download_json( config_url, display_id, 'Downloading config JSON', transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) mediafile = config['mediafile'] video_url = mediafile['src'] m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url) if m: formats = [{ 'url': m.group('url'), 'app': m.group('app'), 'play_path': m.group('playpath'), 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', 'ext': 'flv', }] else: formats = [{ 'url': video_url, }] self._sort_formats(formats) title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) description = clean_html(self._og_search_description(webpage, default=None)) thumbnail = config.get('poster') if site == 'novaplus': upload_date = unified_strdate(self._search_regex( r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) elif site == 'fanda': upload_date = unified_strdate(self._search_regex( r'(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) else: upload_date = None return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'upload_date': upload_date, 'thumbnail': thumbnail, 'formats': formats, }
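A hedged worked example of the fallback config URL assembled above when the page does not embed one (the ids are hypothetical, borrowed from the novaplus test):

    site_id, video_id = '23000', '1756825'
    config_url = ('http://tn.nova.cz/bin/player/videojs/config.php'
                  '?site=%s&media=%s&jsVar=vjsconfig' % (site_id, video_id))
    # -> http://tn.nova.cz/bin/player/videojs/config.php?site=23000&media=1756825&jsVar=vjsconfig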
youtube-dl/youtube_dl/extractor/appleconnect.py
# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( str_to_int, ExtractorError ) class AppleConnectIE(InfoExtractor): _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)' _TEST = { 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', 'md5': '10d0f2799111df4cb1c924520ca78f98', 'info_dict': { 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', 'ext': 'm4v', 'title': 'Energy', 'uploader': 'Drake', 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg', 'upload_date': '20150710', 'timestamp': 1436545535, }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) try: video_json = self._html_search_regex( r'class="auc-video-data">(\{.*?\})', webpage, 'json') except ExtractorError: raise ExtractorError('This post doesn\'t contain a video', expected=True) video_data = self._parse_json(video_json, video_id) timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp')) like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count')) return { 'id': video_id, 'url': video_data['sslSrc'], 'title': video_data['title'], 'description': video_data['description'], 'uploader': video_data['artistName'], 'thumbnail': video_data['artworkUrl'], 'timestamp': timestamp, 'like_count': like_count, }
youtube-dl/youtube_dl/extractor/tv4.py
# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, parse_iso8601, ) class TV4IE(InfoExtractor): IE_DESC = 'tv4.se and tv4play.se' _VALID_URL = r'''(?x)https?://(?:www\.)? (?: tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: (?:program|barn)/(?:[^\?]+)\?video_id=| iframe/video/| film/| sport/| ) )(?P<id>[0-9]+)''' _TESTS = [ { 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650', 'md5': '909d6454b87b10a25aa04c4bdd416a9b', 'info_dict': { 'id': '2491650', 'ext': 'mp4', 'title': 'Kalla Fakta 5 (english subtitles)', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': int, 'upload_date': '20131125', }, }, { 'url': 'http://www.tv4play.se/iframe/video/3054113', 'md5': '77f851c55139ffe0ebd41b6a5552489b', 'info_dict': { 'id': '3054113', 'ext': 'mp4', 'title': 'Så här jobbar ficktjuvarna - se avslöjande bilder', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'Unika bilder avslöjar hur turisternas fickor vittjas mitt på Stockholms central. Två experter på ficktjuvarna avslöjar knepen du ska se upp för.', 'timestamp': int, 'upload_date': '20150130', }, }, { 'url': 'http://www.tv4play.se/sport/3060959', 'only_matching': True, }, { 'url': 'http://www.tv4play.se/film/2378136', 'only_matching': True, }, { 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412', 'only_matching': True, }, ] def _real_extract(self, url): video_id = self._match_id(url) info = self._download_json( 'http://www.tv4play.se/player/assets/%s.json' % video_id, video_id, 'Downloading video info JSON') # If is_geo_restricted is true, it doesn't necessarily mean we can't download it if info['is_geo_restricted']: self.report_warning('This content might not be available in your country due to licensing restrictions.') if info['requires_subscription']: raise ExtractorError('This content requires subscription.', expected=True) sources_data = self._download_json( 'https://prima.tv4play.se/api/web/asset/%s/play.json?protocol=http&videoFormat=MP4' % video_id, video_id, 'Downloading sources JSON') sources = sources_data['playback'] formats = [] for item in sources.get('items', {}).get('item', []): ext, bitrate = item['mediaFormat'], item['bitrate'] formats.append({ 'format_id': '%s_%s' % (ext, bitrate), 'tbr': bitrate, 'ext': ext, 'url': item['url'], }) self._sort_formats(formats) return { 'id': video_id, 'title': info['title'], 'formats': formats, 'description': info.get('description'), 'timestamp': parse_iso8601(info.get('broadcast_date_time')), 'duration': info.get('duration'), 'thumbnail': info.get('image'), 'is_live': sources.get('live'), }
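A hedged worked example of how the tv4 format list above is assembled from a single play.json item (the values are made up):

    item = {'mediaFormat': 'mp4', 'bitrate': 2000, 'url': 'http://example.com/video.mp4'}
    fmt = {
        'format_id': '%s_%s' % (item['mediaFormat'], item['bitrate']),  # 'mp4_2000'
        'tbr': item['bitrate'],
        'ext': item['mediaFormat'],
        'url': item['url'],
    }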
youtube-dl/youtube_dl/extractor/nowtv.py
# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, int_or_none, parse_iso8601, parse_duration, remove_start, ) class NowTVBaseIE(InfoExtractor): _VIDEO_FIELDS = ( 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', 'broadcastStartDate', 'seoUrl', 'duration', 'files', 'format.defaultImage169Format', 'format.defaultImage169Logo') def _extract_video(self, info, display_id=None): video_id = compat_str(info['id']) files = info['files'] if not files: if info.get('geoblocked', False): raise ExtractorError( 'Video %s is not available from your location due to geo restriction' % video_id, expected=True) if not info.get('free', True): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) formats = [] for item in files['items']: if determine_ext(item['path']) != 'f4v': continue app, play_path = remove_start(item['path'], '/').split('/', 1) formats.append({ 'url': 'rtmpe://fms.rtl.de', 'app': app, 'play_path': 'mp4:%s' % play_path, 'ext': 'flv', 'page_url': 'http://rtlnow.rtl.de', 'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf', 'tbr': int_or_none(item.get('bitrate')), }) self._sort_formats(formats) title = info['title'] description = info.get('articleLong') or info.get('articleShort') timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') duration = parse_duration(info.get('duration')) f = info.get('format', {}) thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') return { 'id': video_id, 'display_id': display_id or info.get('seoUrl'), 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, 'formats': formats, } class NowTVIE(NowTVBaseIE): _WORKING = False _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)' _TESTS = [{ # rtl 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player', 'info_dict': { 'id': '203519', 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', 'ext': 'flv', 'title': 'Inka Bause stellt die neuen Bauern vor', 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1432580700, 'upload_date': '20150525', 'duration': 2786, }, 'params': { # rtmp download 'skip_download': True, }, }, { # rtl2 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player', 'info_dict': { 'id': '203481', 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', 'ext': 'flv', 'title': 'Berlin - Tag & Nacht (Folge 934)', 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1432666800, 'upload_date': '20150526', 'duration': 2641, }, 'params': { # rtmp download 'skip_download': True, }, }, { # rtlnitro 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player', 'info_dict': { 'id': '165780', 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', 'ext': 'flv', 'title': 'Hals- und Beinbruch', 'description': 'md5:b50d248efffe244e6f56737f0911ca57', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1432415400, 'upload_date': '20150523', 'duration': 2742, }, 'params': { # rtmp download 'skip_download': True, }, }, { # superrtl 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player', 'info_dict': { 'id': '99205', 'display_id': 'medicopter-117/angst', 'ext': 'flv', 'title': 'Angst!', 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1222632900, 'upload_date': '20080928', 'duration': 3025, }, 'params': { # rtmp download 'skip_download': True, }, }, { # ntv 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player', 'info_dict': { 'id': '203521', 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', 'ext': 'flv', 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1432751700, 'upload_date': '20150527', 'duration': 1083, }, 'params': { # rtmp download 'skip_download': True, }, }, { # vox 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', 'info_dict': { 'id': '128953', 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', 'ext': 'flv', 'title': "Büro-Fall / Chihuahua 'Joel'", 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1432408200, 'upload_date': '20150523', 'duration': 3092, }, 'params': { # rtmp download 'skip_download': True, }, }, { 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', 'only_matching': True, }, { 'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', 'only_matching': True, }, { 'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player', 'only_matching': True, }, { 'url': 'http://www.nowtv.de/rtl2/zuhause-im-glueck/jahr/2015/11/eine-erschuetternde-diagnose/player', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = '%s/%s' % (mobj.group('show_id'), mobj.group('id')) info = self._download_json( 'https://api.nowtv.de/v3/movies/%s?fields=%s' % (display_id, ','.join(self._VIDEO_FIELDS)), display_id) return self._extract_video(info, display_id) class NowTVListIE(NowTVBaseIE): _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/list/(?P<id>[^?/#&]+)$' _SHOW_FIELDS = ('title', ) _SEASON_FIELDS = ('id', 'headline', 'seoheadline', ) _TESTS = [{ 'url': 'http://www.nowtv.at/rtl/stern-tv/list/aktuell', 'info_dict': { 'id': '17006', 'title': 'stern TV - Aktuell', }, 'playlist_count': 1, }, { 'url': 'http://www.nowtv.at/rtl/das-supertalent/list/free-staffel-8', 'info_dict': { 'id': '20716', 'title': 'Das Supertalent - FREE Staffel 8', }, 'playlist_count': 14, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) show_id = mobj.group('show_id') season_id = mobj.group('id') fields = [] fields.extend(self._SHOW_FIELDS) fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS) fields.extend( 'formatTabs.formatTabPages.container.movies.%s' % field for field in self._VIDEO_FIELDS) list_info = self._download_json( 'https://api.nowtv.de/v3/formats/seo?fields=%s&name=%s.php' % (','.join(fields), show_id), season_id) season = next( season for season in list_info['formatTabs']['items'] if season.get('seoheadline') == season_id) title = '%s - %s' % (list_info['title'], season['headline']) entries = [] for container in season['formatTabPages']['items']: for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []: entries.append(self._extract_video(info)) return self.playlist_result( entries, compat_str(season.get('id') or season_id), title)
youtube-dl/youtube_dl/extractor/spike.py
from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor class SpikeIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)https?:// (?:www\.spike\.com/(?:video-(?:clips|playlists)|(?:full-)?episodes)/.+| m\.spike\.com/videos/video\.rbml\?id=(?P<id>[^&]+)) ''' _TEST = { 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', 'md5': '1a9265f32b0c375793d6c4ce45255256', 'info_dict': { 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', 'ext': 'mp4', 'title': 'Auction Hunters|Can Allen Ride A Hundred Year-Old Motorcycle?', 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', }, } _FEED_URL = 'http://www.spike.com/feeds/mrss/' _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' def _real_extract(self, url): mobile_id = self._match_id(url) if mobile_id: url = 'http://www.spike.com/video-clips/%s' % mobile_id return super(SpikeIE, self)._real_extract(url)
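SpikeIE's only extra work over the MTV base class is the mobile-to-desktop rewrite above; a hedged illustration with a made-up clip id:

    mobile_id = 'lhtu8m'  # from a hypothetical http://m.spike.com/videos/video.rbml?id=lhtu8m
    url = 'http://www.spike.com/video-clips/%s' % mobile_id
    # -> http://www.spike.com/video-clips/lhtu8m, which the desktop branch of _VALID_URL accepts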
youtube-dl/youtube_dl/extractor/tvigle.py
# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, int_or_none, parse_age_limit, ) class TvigleIE(InfoExtractor): IE_NAME = 'tvigle' IE_DESC = 'Интернет-телевидение Tvigle.ru' _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))' _TESTS = [ { 'url': 'http://www.tvigle.ru/video/sokrat/', 'md5': '36514aed3657d4f70b4b2cef8eb520cd', 'info_dict': { 'id': '1848932', 'display_id': 'sokrat', 'ext': 'flv', 'title': 'Сократ', 'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17', 'duration': 6586, 'age_limit': 12, }, 'skip': 'georestricted', }, { 'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/', 'md5': 'e7efe5350dd5011d0de6550b53c3ba7b', 'info_dict': { 'id': '5142516', 'ext': 'flv', 'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком', 'description': 'md5:027f7dc872948f14c96d19b4178428a4', 'duration': 186.080, 'age_limit': 0, }, 'skip': 'georestricted', }, { 'url': 'https://cloud.tvigle.ru/video/5267604/', 'only_matching': True, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') if not video_id: webpage = self._download_webpage(url, display_id) video_id = self._html_search_regex( r'class="video-preview current_playing" id="(\d+)">', webpage, 'video id') video_data = self._download_json( 'http://cloud.tvigle.ru/api/play/video/%s/' % video_id, display_id) item = video_data['playlist']['items'][0] videos = item.get('videos') error_message = item.get('errorMessage') if not videos and error_message: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) title = item['title'] description = item.get('description') thumbnail = item.get('thumbnail') duration = float_or_none(item.get('durationMilliseconds'), 1000) age_limit = parse_age_limit(item.get('ageRestrictions')) formats = [] for vcodec, fmts in item['videos'].items(): for format_id, video_url in fmts.items(): if format_id == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', m3u8_id=vcodec)) continue height = self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None) formats.append({ 'url': video_url, 'format_id': '%s-%s' % (vcodec, format_id), 'vcodec': vcodec, 'height': int_or_none(height), 'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)), }) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'age_limit': age_limit, 'formats': formats, }
youtube-dl/youtube_dl/extractor/fivemin.py
from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_parse_qs, compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( ExtractorError, parse_duration, replace_extension, ) class FiveMinIE(InfoExtractor): IE_NAME = '5min' _VALID_URL = r'''(?x) (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=| https?://(?:(?:massively|www)\.)?joystiq\.com/video/| 5min:) (?P<id>\d+) ''' _TESTS = [ { # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/ 'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791', 'md5': '4f7b0b79bf1a470e5004f7112385941d', 'info_dict': { 'id': '518013791', 'ext': 'mp4', 'title': 'iPad Mini with Retina Display Review', 'duration': 177, }, }, { # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247 'url': '5min:518086247', 'md5': 'e539a9dd682c288ef5a498898009f69e', 'info_dict': { 'id': '518086247', 'ext': 'mp4', 'title': 'How to Make a Next-Level Fruit Salad', 'duration': 184, }, }, ] _ERRORS = { 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.', 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.', 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.', 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.', 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', } _QUALITIES = { 1: { 'width': 640, 'height': 360, }, 2: { 'width': 854, 'height': 480, }, 4: { 'width': 1280, 'height': 720, }, 8: { 'width': 1920, 'height': 1080, }, 16: { 'width': 640, 'height': 360, }, 32: { 'width': 854, 'height': 480, }, 64: { 'width': 1280, 'height': 720, }, 128: { 'width': 640, 'height': 360, }, } def _real_extract(self, url): video_id = self._match_id(url) embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id embed_page = self._download_webpage(embed_url, video_id, 'Downloading embed page') sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') query = compat_urllib_parse.urlencode({ 'func': 'GetResults', 'playlist': video_id, 'sid': sid, 'isPlayerSeed': 'true', 'url': embed_url, }) response = self._download_json( 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query, video_id) if not response['success']: raise ExtractorError( '%s said: %s' % ( self.IE_NAME, self._ERRORS.get(response['errorMessage'], response['errorMessage'])), expected=True) info = response['binding'][0] formats = [] parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) for rendition in info['Renditions']: if rendition['RenditionType'] == 'm3u8': formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) elif rendition['RenditionType'] == 'aac': continue else: rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) quality = self._QUALITIES.get(rendition['ID'], {}) formats.append({ 'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']), 'url': rendition_url, 'width': quality.get('width'), 'height': quality.get('height'), }) self._sort_formats(formats) return { 'id': video_id, 'title': info['Title'], 'thumbnail': info.get('ThumbURL'), 'duration': parse_duration(info.get('Duration')), 'formats': formats, }
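A hedged worked example of the 5min rendition URL rewrite above: the rendition ID is spliced into the double slash of the base path and the extension is swapped to the rendition type (the path is made up; replace_extension is the helper imported at the top of this file):

    from youtube_dl.utils import replace_extension
    path = '/content//clip.flv'            # hypothetical path from EmbededURL
    path = path.replace('//', '/%s/' % 4)  # -> '/content/4/clip.flv'
    print(replace_extension(path, 'mp4'))  # -> '/content/4/clip.mp4'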
youtube-dl/youtube_dl/extractor/baidu.py
# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import unescapeHTML class BaiduVideoIE(InfoExtractor): IE_DESC = '百度视频' _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' _TESTS = [{ 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', 'info_dict': { 'id': '1069', 'title': '中华小当家 TV版国语', 'description': 'md5:51be07afe461cf99fa61231421b5397c', }, 'playlist_count': 52, }, { 'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand', 'info_dict': { 'id': '11595', 'title': 're:^奔跑吧兄弟', 'description': 'md5:1bf88bad6d850930f542d51547c089b8', }, 'playlist_mincount': 12, }] def _call_api(self, path, category, playlist_id, note): return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % ( path, category, playlist_id), playlist_id, note) def _real_extract(self, url): category, playlist_id = re.match(self._VALID_URL, url).groups() if category == 'show': category = 'tvshow' if category == 'tv': category = 'tvplay' playlist_detail = self._call_api( 'xqinfo', category, playlist_id, 'Download playlist JSON metadata') playlist_title = playlist_detail['title'] playlist_description = unescapeHTML(playlist_detail.get('intro')) episodes_detail = self._call_api( 'xqsingle', category, playlist_id, 'Download episodes JSON metadata') entries = [self.url_result( episode['url'], video_title=episode['title'] ) for episode in episodes_detail['videos']] return self.playlist_result( entries, playlist_id, playlist_title, playlist_description)
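A hedged worked example of the API URL _call_api builds above (note that `worktype=adnative` and the category are simply concatenated), using the ids from the first test:

    path, category, playlist_id = 'xqinfo', 'comic', '1069'
    url = 'http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % (path, category, playlist_id)
    # -> http://app.video.baidu.com/xqinfo/?worktype=adnativecomic&id=1069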
youtube-dl/youtube_dl/extractor/udemy.py
from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_urllib_parse, compat_urllib_request, ) from ..utils import ( ExtractorError, float_or_none, int_or_none, sanitized_Request, unescapeHTML, ) class UdemyIE(InfoExtractor): IE_NAME = 'udemy' _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' _NETRC_MACHINE = 'udemy' _TESTS = [{ 'url': 'https://www.udemy.com/java-tutorial/#/lecture/172757', 'md5': '98eda5b657e752cf945d8445e261b5c5', 'info_dict': { 'id': '160614', 'ext': 'mp4', 'title': 'Introduction and Installation', 'description': 'md5:c0d51f6f21ef4ec65f091055a5eef876', 'duration': 579.29, }, 'skip': 'Requires udemy account credentials', }] def _enroll_course(self, webpage, course_id): checkout_url = unescapeHTML(self._search_regex( r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) if checkout_url: raise ExtractorError( 'Course %s is not free. You have to pay for it before you can download. ' 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True) enroll_url = unescapeHTML(self._search_regex( r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', webpage, 'enroll url', group='url', default=None)) if enroll_url: webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) def _download_lecture(self, course_id, lecture_id): return self._download_json( 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( course_id, lecture_id, compat_urllib_parse.urlencode({ 'video_only': '', 'auto_play': '', 'fields[lecture]': 'title,description,asset', 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', 'instructorPreviewMode': 'False', })), lecture_id, 'Downloading lecture JSON') def _handle_error(self, response): if not isinstance(response, dict): return error = response.get('error') if error: error_str = 'Udemy returned error #%s: %s' % (error.get('code'), error.get('message')) error_data = error.get('data') if error_data: error_str += ' - %s' % error_data.get('formErrors') raise ExtractorError(error_str, expected=True) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): headers = { 'X-Udemy-Snail-Case': 'true', 'X-Requested-With': 'XMLHttpRequest', } for cookie in self._downloader.cookiejar: if cookie.name == 'client_id': headers['X-Udemy-Client-Id'] = cookie.value elif cookie.name == 'access_token': headers['X-Udemy-Bearer-Token'] = cookie.value headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value if isinstance(url_or_request, compat_urllib_request.Request): for header, value in headers.items(): url_or_request.add_header(header, value) else: url_or_request = sanitized_Request(url_or_request, headers=headers) response = super(UdemyIE, self)._download_json(url_or_request, video_id, note) self._handle_error(response) return response def _real_initialize(self): self._login() def _login(self): (username, password) = self._get_login_info() if username is None: return login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') def is_logged(webpage): return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<']) # already logged in if is_logged(login_popup): return login_form = self._form_hidden_inputs('login-form', login_popup) login_form.update({ 'email': username.encode('utf-8'), 'password': password.encode('utf-8'), }) request = sanitized_Request( self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) request.add_header('Referer', self._ORIGIN_URL) request.add_header('Origin', self._ORIGIN_URL) response = self._download_webpage( request, None, 'Logging in as %s' % username) if not is_logged(response): error = self._html_search_regex( r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>', response, 'error message', default=None) if error: raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') def _real_extract(self, url): lecture_id = self._match_id(url) webpage = self._download_webpage(url, lecture_id) course_id = self._search_regex( r'data-course-id=["\'](\d+)', webpage, 'course id') try: lecture = self._download_lecture(course_id, lecture_id) except ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: self._enroll_course(webpage, course_id) lecture = self._download_lecture(course_id, lecture_id) else: raise title = lecture['title'] description = lecture.get('description') asset = lecture['asset'] asset_type = asset.get('assetType') or asset.get('asset_type') if asset_type != 'Video': raise ExtractorError( 'Lecture %s is not a video' % lecture_id, expected=True) stream_url = asset.get('streamUrl') or asset.get('stream_url') if stream_url: youtube_url = self._search_regex( r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) if youtube_url: return self.url_result(youtube_url, 'Youtube') video_id = asset['id'] thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') duration = float_or_none(asset.get('data', {}).get('duration')) outputs = asset.get('data', {}).get('outputs', {}) formats = [] for format_ in asset.get('download_urls', {}).get('Video', []): video_url = format_.get('file') if not video_url: continue format_id = format_.get('label') f = { 'url': format_['file'], 'height': int_or_none(format_id), } if format_id: # Some videos contain additional metadata (e.g. # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) output = outputs.get(format_id) if isinstance(output, dict): f.update({ 'format_id': '%sp' % (output.get('label') or format_id), 'width': int_or_none(output.get('width')), 'height': int_or_none(output.get('height')), 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), 'vcodec': output.get('video_codec'), 'fps': int_or_none(output.get('frame_rate')), 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), 'acodec': output.get('audio_codec'), 'asr': int_or_none(output.get('audio_sample_rate')), 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), 'filesize': int_or_none(output.get('file_size_in_bytes')), }) else: f['format_id'] = '%sp' % format_id formats.append(f) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats } class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[\da-z-]+)' _TESTS = [] @classmethod def suitable(cls, url): return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url) def _real_extract(self, url): course_path = self._match_id(url) webpage = self._download_webpage(url, course_path) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s' % course_path, course_path, 'Downloading course JSON') course_id = response['id'] course_title = response.get('title') self._enroll_course(webpage, course_id) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, course_id, 'Downloading course curriculum') entries = [] chapter, chapter_number = None, None for asset in response: asset_type = asset.get('assetType') or asset.get('asset_type') if asset_type == 'Video': asset_id = asset.get('id') if asset_id: entry = { '_type': 'url_transparent', 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), 'ie_key': UdemyIE.ie_key(), } if chapter_number: entry['chapter_number'] = chapter_number if chapter: entry['chapter'] = chapter entries.append(entry) elif asset.get('type') == 'chapter': chapter_number = asset.get('index') or asset.get('object_index') chapter = asset.get('title') return self.playlist_result(entries, course_id, course_title)
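A hedged sketch of the lecture API request _download_lecture assembles above (the course and lecture ids come from the test; the field list is abridged):

    from youtube_dl.compat import compat_urllib_parse
    query = compat_urllib_parse.urlencode({'video_only': '', 'auto_play': ''})
    url = ('https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s'
           % ('160614', '172757', query))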
youtube-dl/youtube_dl/extractor/channel9.py
from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, parse_filesize, qualities, ) class Channel9IE(InfoExtractor): ''' Common extractor for channel9.msdn.com. The type of provided URL (video or playlist) is determined according to meta Search.PageType from the web page HTML rather than the URL itself, as it is not always possible to tell from the URL alone. ''' IE_DESC = 'Channel 9' IE_NAME = 'channel9' _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' _TESTS = [ { 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', 'md5': 'bbd75296ba47916b754e73c3a4bbdf10', 'info_dict': { 'id': 'Events/TechEd/Australia/2013/KOS002', 'ext': 'mp4', 'title': 'Developer Kick-Off Session: Stuff We Love', 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 'duration': 4576, 'thumbnail': 're:http://.*\.jpg', 'session_code': 'KOS002', 'session_day': 'Day 1', 'session_room': 'Arena 1A', 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'], }, }, { 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', 'md5': 'b43ee4529d111bc37ba7ee4f34813e68', 'info_dict': { 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', 'ext': 'mp4', 'title': 'Self-service BI with Power BI - nuclear testing', 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 'duration': 1540, 'thumbnail': 're:http://.*\.jpg', 'authors': ['Mike Wilmot'], }, }, { # low quality mp4 is best 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', 'info_dict': { 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', 'ext': 'mp4', 'title': 'Ranges for the Standard Library', 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', 'duration': 5646, 'thumbnail': 're:http://.*\.jpg', }, 'params': { 'skip_download': True, }, } ] _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

<h3>File\s+size</h3>

    \s*(?P.*?)\s* )? # File size part may be missing ''' quality = qualities(( 'MP3', 'MP4', 'Low Quality WMV', 'Low Quality MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4')) formats = [{ 'url': x.group('url'), 'format_id': x.group('quality'), 'format_note': x.group('note'), 'format': '%s (%s)' % (x.group('quality'), x.group('note')), 'filesize_approx': parse_filesize(x.group('filesize')), 'quality': quality(x.group('quality')), 'vcodec': 'none' if x.group('note') == 'Audio only' else None, } for x in list(re.finditer(FORMAT_REGEX, html))] self._sort_formats(formats) return formats def _extract_title(self, html): title = self._html_search_meta('title', html, 'title') if title is None: title = self._og_search_title(html) TITLE_SUFFIX = ' (Channel 9)' if title is not None and title.endswith(TITLE_SUFFIX): title = title[:-len(TITLE_SUFFIX)] return title def _extract_description(self, html): DESCRIPTION_REGEX = r'''(?sx) \s* \s* (?P.+?)\s* \s* ''' m = re.search(DESCRIPTION_REGEX, html) if m is not None: return m.group('description') return self._html_search_meta('description', html, 'description') def _extract_duration(self, html): m = re.search(r'"length": *"(?P\d{2}):(?P\d{2}):(?P\d{2})"', html) return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None def _extract_slides(self, html): m = re.search(r'Slides', html) return m.group('slidesurl') if m is not None else None def _extract_zip(self, html): m = re.search(r'Zip', html) return m.group('zipurl') if m is not None else None def _extract_avg_rating(self, html): m = re.search(r'

<div class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></div>
    ', html) return float(m.group('avgrating')) if m is not None else 0 def _extract_rating_count(self, html): m = re.search(r'
<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>
    ', html) return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0 def _extract_view_count(self, html): m = re.search(r'
<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>
  • ', html) return int(self._fix_count(m.group('viewcount'))) if m is not None else 0 def _extract_comment_count(self, html): m = re.search(r'
<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>
  • ', html) return int(self._fix_count(m.group('commentcount'))) if m is not None else 0 def _fix_count(self, count): return int(str(count).replace(',', '')) if count is not None else None def _extract_authors(self, html): m = re.search(r'(?s)
  • (.*?)
  • ', html) if m is None: return None return re.findall(r'([^<]+)', m.group(1)) def _extract_session_code(self, html): m = re.search(r'
<li class="code">\s*(?P<code>.+?)\s*</li>
  • ', html) return m.group('code') if m is not None else None def _extract_session_day(self, html): m = re.search(r'
<li class="day">\s*(?P<day>[^<]+)\s*</li>
  • ', html) return m.group('day').strip() if m is not None else None def _extract_session_room(self, html): m = re.search(r'
<li class="room">\s*(?P<room>.+?)\s*</li>
  • ', html) return m.group('room') if m is not None else None def _extract_session_speakers(self, html): return re.findall(r'([^<]+)', html) def _extract_content(self, html, content_path): # Look for downloadable content formats = self._formats_from_html(html) slides = self._extract_slides(html) zip_ = self._extract_zip(html) # Nothing to download if len(formats) == 0 and slides is None and zip_ is None: self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path) return # Extract meta title = self._extract_title(html) description = self._extract_description(html) thumbnail = self._og_search_thumbnail(html) duration = self._extract_duration(html) avg_rating = self._extract_avg_rating(html) rating_count = self._extract_rating_count(html) view_count = self._extract_view_count(html) comment_count = self._extract_comment_count(html) common = { '_type': 'video', 'id': content_path, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'avg_rating': avg_rating, 'rating_count': rating_count, 'view_count': view_count, 'comment_count': comment_count, } result = [] if slides is not None: d = common.copy() d.update({'title': title + '-Slides', 'url': slides}) result.append(d) if zip_ is not None: d = common.copy() d.update({'title': title + '-Zip', 'url': zip_}) result.append(d) if len(formats) > 0: d = common.copy() d.update({'title': title, 'formats': formats}) result.append(d) return result def _extract_entry_item(self, html, content_path): contents = self._extract_content(html, content_path) if contents is None: return contents if len(contents) > 1: raise ExtractorError('Got more than one entry') result = contents[0] result['authors'] = self._extract_authors(html) return result def _extract_session(self, html, content_path): contents = self._extract_content(html, content_path) if contents is None: return contents session_meta = { 'session_code': self._extract_session_code(html), 'session_day': self._extract_session_day(html), 'session_room': self._extract_session_room(html), 'session_speakers': self._extract_session_speakers(html), } for content in contents: content.update(session_meta) return self.playlist_result(contents) def _extract_list(self, content_path): rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS') entries = [self.url_result(session_url.text, 'Channel9') for session_url in rss.findall('./channel/item/link')] title_text = rss.find('./channel/title').text return self.playlist_result(entries, content_path, title_text) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) content_path = mobj.group('contentpath') webpage = self._download_webpage(url, content_path, 'Downloading web page') page_type_m = re.search(r'', webpage) if page_type_m is not None: page_type = page_type_m.group('pagetype') if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content return self._extract_entry_item(webpage, content_path) elif page_type == 'Session': # Event session page, may contain downloadable content return self._extract_session(webpage, content_path) elif page_type == 'Event': return self._extract_list(content_path) else: raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True) else: # Assuming list return self._extract_list(content_path) youtube-dl/youtube_dl/extractor/pornhub.py0000644000000000000000000001546412662061715020102 0ustar rootrootfrom __future__ import unicode_literals import os import re from .common import InfoExtractor from 
..compat import ( compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlparse, ) from ..utils import ( ExtractorError, int_or_none, sanitized_Request, str_to_int, ) from ..aes import ( aes_decrypt_text ) class PornHubIE(InfoExtractor): _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P[0-9a-z]+)' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': '1e19b41231a02eba417839222ac9d58e', 'info_dict': { 'id': '648719015', 'ext': 'mp4', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 'uploader': 'Babes', 'duration': 361, 'view_count': int, 'like_count': int, 'dislike_count': int, 'comment_count': int, 'age_limit': 18, } }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, }, { 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', 'only_matching': True, }] @classmethod def _extract_url(cls, webpage): mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage) if mobj: return mobj.group('url') def _extract_count(self, pattern, webpage, name): return str_to_int(self._search_regex( pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): video_id = self._match_id(url) req = sanitized_Request( 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id) req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) error_msg = self._html_search_regex( r'(?s)
<div class="userMessageSection[^"]*".*?>(.*?)</div>
', webpage, 'error message', default=None) if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) raise ExtractorError( 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: video_title = flashvars.get('video_title') thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) else: video_title, thumbnail, duration = [None] * 3 if not video_title: video_title = self._html_search_regex(r'<h1 [^>
    ]+>([^<]+)', webpage, 'title') video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<', webpage, 'uploader', fatal=False) view_count = self._extract_count( r'([\d,\.]+) views', webpage, 'view') like_count = self._extract_count( r'([\d,\.]+)', webpage, 'like') dislike_count = self._extract_count( r'([\d,\.]+)', webpage, 'dislike') comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage))) if webpage.find('"encrypted":true') != -1: password = compat_urllib_parse_unquote_plus( self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) formats = [] for video_url in video_urls: path = compat_urllib_parse_urlparse(video_url).path extension = os.path.splitext(path)[1][1:] format = path.split('/')[5].split('_')[:2] format = '-'.join(format) m = re.match(r'^(?P[0-9]+)[pP]-(?P[0-9]+)[kK]$', format) if m is None: height = None tbr = None else: height = int(m.group('height')) tbr = int(m.group('tbr')) formats.append({ 'url': video_url, 'ext': extension, 'format': format, 'format_id': format, 'tbr': tbr, 'height': height, }) self._sort_formats(formats) return { 'id': video_id, 'uploader': video_uploader, 'title': video_title, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, 'formats': formats, 'age_limit': 18, } class PornHubPlaylistBaseIE(InfoExtractor): def _extract_entries(self, webpage): return [ self.url_result('http://www.pornhub.com/%s' % video_url, PornHubIE.ie_key()) for video_url in set(re.findall( r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) ] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) entries = self._extract_entries(webpage) playlist = self._parse_json( self._search_regex( r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'), playlist_id) return self.playlist_result( entries, playlist_id, playlist.get('title'), playlist.get('description')) class PornHubPlaylistIE(PornHubPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P\d+)' _TESTS = [{ 'url': 'http://www.pornhub.com/playlist/6201671', 'info_dict': { 'id': '6201671', 'title': 'P0p4', }, 'playlist_mincount': 35, }] class PornHubUserVideosIE(PornHubPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/rushandlia/videos', 'info_dict': { 'id': 'rushandlia', }, 'playlist_mincount': 13, }] def _real_extract(self, url): user_id = self._match_id(url) webpage = self._download_webpage(url, user_id) return self.playlist_result(self._extract_entries(webpage), user_id) youtube-dl/youtube_dl/extractor/wsj.py0000644000000000000000000000650712641030331017211 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, ) class WSJIE(InfoExtractor): _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P[a-zA-Z0-9-]+)' IE_DESC = 'Wall Street Journal' _TEST = { 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'md5': 
'9747d7a6ebc2f4df64b981e1dde9efa9', 'info_dict': { 'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'ext': 'mp4', 'upload_date': '20150202', 'uploader_id': 'jdesai', 'creator': 'jdesai', 'categories': list, # a long list 'duration': 90, 'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo', }, } def _real_extract(self, url): video_id = self._match_id(url) bitrates = [128, 174, 264, 320, 464, 664, 1264] api_url = ( 'http://video-api.wsj.com/api-video/find_all_videos.asp?' 'type=guid&count=1&query=%s&' 'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,' 'author,description,name,linkURL,videoStillURL,duration,videoURL,' 'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,' 'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,' 'allthingsd-subsection,sm-section,sm-subsection,provider,' 'formattedCreationDate,keywords,keywordsOmniture,column,editor,' 'emailURL,emailPartnerID,showName,omnitureProgramName,' 'omnitureVideoFormat,linkRelativeURL,touchCastID,' 'omniturePublishDate,%s') % ( video_id, ','.join('video%dkMP4Url' % br for br in bitrates)) info = self._download_json(api_url, video_id)['items'][0] # Thumbnails are conveniently in the correct format already thumbnails = info.get('thumbnailList') creator = info.get('author') uploader_id = info.get('editor') categories = info.get('keywords') duration = int_or_none(info.get('duration')) upload_date = unified_strdate( info.get('formattedCreationDate'), day_first=False) title = info.get('name', info.get('titletag')) formats = [{ 'format_id': 'f4m', 'format_note': 'f4m (meta URL)', 'url': info['videoURL'], }] if info.get('hls'): formats.extend(self._extract_m3u8_formats( info['hls'], video_id, ext='mp4', preference=0, entry_protocol='m3u8_native')) for br in bitrates: field = 'video%dkMP4Url' % br if info.get(field): formats.append({ 'format_id': 'mp4-%d' % br, 'container': 'mp4', 'tbr': br, 'url': info[field], }) self._sort_formats(formats) return { 'id': video_id, 'formats': formats, 'thumbnails': thumbnails, 'creator': creator, 'uploader_id': uploader_id, 'duration': duration, 'upload_date': upload_date, 'title': title, 'categories': categories, } youtube-dl/youtube_dl/extractor/metacritic.py0000644000000000000000000000400312641030331020517 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( fix_xml_ampersands, ) class MetacriticIE(InfoExtractor): _VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P\d+)' _TEST = { 'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', 'info_dict': { 'id': '3698222', 'ext': 'mp4', 'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors', 'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.', 'duration': 221, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) # The xml is not well formatted, there are raw '&' info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, video_id, 'Downloading info xml', transform_source=fix_xml_ampersands) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) formats = [] for videoFile in clip.findall('httpURI/videoFile'): rate_str = videoFile.find('rate').text video_url = videoFile.find('filePath').text formats.append({ 'url': video_url, 'ext': 'mp4', 'format_id': rate_str, 'tbr': 
int(rate_str), }) self._sort_formats(formats) description = self._html_search_regex(r'Description:(.*?)

    ', webpage, 'description', flags=re.DOTALL) return { 'id': video_id, 'title': clip.find('title').text, 'formats': formats, 'description': description, 'duration': int(clip.find('duration').text), } youtube-dl/youtube_dl/extractor/videomore.py0000644000000000000000000002111312656352065020406 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( int_or_none, parse_age_limit, parse_iso8601, xpath_text, ) class VideomoreIE(InfoExtractor): IE_NAME = 'videomore' _VALID_URL = r'videomore:(?P\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P\d+)(?:[/?#&]|\.(?:xml|json)|$)' _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', 'md5': '70875fbf57a1cd004709920381587185', 'info_dict': { 'id': '367617', 'ext': 'flv', 'title': 'В гостях Алексей Чумаков и Юлия Ковальчук', 'description': 'В гостях – лучшие романтические комедии года, «Выживший» Иньярриту и «Стив Джобс» Дэнни Бойла.', 'series': 'Кино в деталях', 'episode': 'В гостях Алексей Чумаков и Юлия Ковальчук', 'episode_number': None, 'season': 'Сезон 2015', 'season_number': 5, 'thumbnail': 're:^https?://.*\.jpg', 'duration': 2910, 'age_limit': 16, 'view_count': int, }, }, { 'url': 'http://videomore.ru/embed/259974', 'info_dict': { 'id': '259974', 'ext': 'flv', 'title': '80 серия', 'description': '«Медведей» ждет решающий матч. Макеев выясняет отношения со Стрельцовым. Парни узнают подробности прошлого Макеева.', 'series': 'Молодежка', 'episode': '80 серия', 'episode_number': 40, 'season': '2 сезон', 'season_number': 2, 'thumbnail': 're:^https?://.*\.jpg', 'duration': 2809, 'age_limit': 16, 'view_count': int, }, 'params': { 'skip_download': True, }, }, { 'url': 'http://videomore.ru/molodezhka/sezon_promo/341073', 'info_dict': { 'id': '341073', 'ext': 'flv', 'title': 'Команда проиграла из-за Бакина?', 'description': 'Молодежка 3 сезон скоро', 'series': 'Молодежка', 'episode': 'Команда проиграла из-за Бакина?', 'episode_number': None, 'season': 'Промо', 'season_number': 99, 'thumbnail': 're:^https?://.*\.jpg', 'duration': 29, 'age_limit': 16, 'view_count': int, }, 'params': { 'skip_download': True, }, }, { 'url': 'http://videomore.ru/elki_3?track_id=364623', 'only_matching': True, }, { 'url': 'http://videomore.ru/embed/364623', 'only_matching': True, }, { 'url': 'http://videomore.ru/video/tracks/364623.xml', 'only_matching': True, }, { 'url': 'http://videomore.ru/video/tracks/364623.json', 'only_matching': True, }, { 'url': 'http://videomore.ru/video/tracks/158031/quotes/33248', 'only_matching': True, }, { 'url': 'videomore:367617', 'only_matching': True, }] @staticmethod def _extract_url(webpage): mobj = re.search( r']+data=(["\'])https?://videomore.ru/player\.swf\?.*config=(?Phttps?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1', webpage) if mobj: return mobj.group('url') def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('sid') or mobj.group('id') video = self._download_xml( 'http://videomore.ru/video/tracks/%s.xml' % video_id, video_id, 'Downloading video XML') video_url = xpath_text(video, './/video_url', 'video url', fatal=True) formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds') data = self._download_json( 'http://videomore.ru/video/tracks/%s.json' % video_id, video_id, 'Downloading video JSON') title = data.get('title') or data['project_title'] description = data.get('description') or data.get('description_raw') timestamp = 
parse_iso8601(data.get('published_at')) duration = int_or_none(data.get('duration')) view_count = int_or_none(data.get('views')) age_limit = parse_age_limit(data.get('min_age')) thumbnails = [{ 'url': thumbnail, } for thumbnail in data.get('big_thumbnail_urls', [])] series = data.get('project_title') episode = data.get('title') episode_number = int_or_none(data.get('episode_of_season') or None) season = data.get('season_title') season_number = int_or_none(data.get('season_pos') or None) return { 'id': video_id, 'title': title, 'description': description, 'series': series, 'episode': episode, 'episode_number': episode_number, 'season': season, 'season_number': season_number, 'thumbnails': thumbnails, 'timestamp': timestamp, 'duration': duration, 'view_count': view_count, 'age_limit': age_limit, 'formats': formats, } class VideomoreVideoIE(InfoExtractor): IE_NAME = 'videomore:video' _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P[^/?#&]+)[/?#&]*$' _TESTS = [{ # single video with og:video:iframe 'url': 'http://videomore.ru/elki_3', 'info_dict': { 'id': '364623', 'ext': 'flv', 'title': 'Ёлки 3', 'description': '', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 5579, 'age_limit': 6, 'view_count': int, }, 'params': { 'skip_download': True, }, }, { # season single series with og:video:iframe 'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya', 'only_matching': True, }, { 'url': 'http://videomore.ru/sejchas_v_seti/serii_221-240/226_vypusk', 'only_matching': True, }, { # single video without og:video:iframe 'url': 'http://videomore.ru/marin_i_ego_druzya', 'info_dict': { 'id': '359073', 'ext': 'flv', 'title': '1 серия. Здравствуй, Аквавилль!', 'description': 'md5:c6003179538b5d353e7bcd5b1372b2d7', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 754, 'age_limit': 6, 'view_count': int, }, 'params': { 'skip_download': True, }, }] @classmethod def suitable(cls, url): return False if VideomoreIE.suitable(url) else super(VideomoreVideoIE, cls).suitable(url) def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_url = self._og_search_property( 'video:iframe', webpage, 'video url', default=None) if not video_url: video_id = self._search_regex( (r'config\s*:\s*["\']https?://videomore\.ru/video/tracks/(\d+)\.xml', r'track-id=["\'](\d+)', r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id') video_url = 'videomore:%s' % video_id return self.url_result(video_url, VideomoreIE.ie_key()) class VideomoreSeasonIE(InfoExtractor): IE_NAME = 'videomore:season' _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P[^/]+/[^/?#&]+)[/?#&]*$' _TESTS = [{ 'url': 'http://videomore.ru/molodezhka/sezon_promo', 'info_dict': { 'id': 'molodezhka/sezon_promo', 'title': 'Молодежка Промо', }, 'playlist_mincount': 12, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) title = self._og_search_title(webpage) entries = [ self.url_result(item) for item in re.findall( r']+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' % display_id, webpage)] return self.playlist_result(entries, display_id, title) youtube-dl/youtube_dl/extractor/ministrygrid.py0000644000000000000000000000404412641030331021124 0ustar rootrootfrom __future__ import unicode_literals import json import re from .common import InfoExtractor from ..utils import ( ExtractorError, smuggle_url, ) class MinistryGridIE(InfoExtractor): _VALID_URL = 
r'https?://www\.ministrygrid.com/([^/?#]*/)*(?P[^/#?]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers', 'md5': '844be0d2a1340422759c2a9101bab017', 'info_dict': { 'id': '3453494717001', 'ext': 'mp4', 'title': 'The Gospel by Numbers', 'description': 'Coming soon from T4G 2014!', 'uploader': 'LifeWay Christian Resources (MG)', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) portlets_json = self._search_regex( r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list') portlets = json.loads(portlets_json) pl_id = self._search_regex( r' {1}'.format(start, stop) else: yield line return '\r\n'.join(_fix_subtitle(subtitles)) youtube-dl/youtube_dl/extractor/fivetv.py0000644000000000000000000000561712641030331017712 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import int_or_none class FiveTVIE(InfoExtractor): _VALID_URL = r'''(?x) http:// (?:www\.)?5-tv\.ru/ (?: (?:[^/]+/)+(?P\d+)| (?P[^/?#]+)(?:[/?#])? ) ''' _TESTS = [{ 'url': 'http://5-tv.ru/news/96814/', 'md5': 'bbff554ad415ecf5416a2f48c22d9283', 'info_dict': { 'id': '96814', 'ext': 'mp4', 'title': 'Россияне выбрали имя для общенациональной платежной системы', 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 180, }, }, { 'url': 'http://5-tv.ru/video/1021729/', 'info_dict': { 'id': '1021729', 'ext': 'mp4', 'title': '3D принтер', 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 180, }, }, { 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails', 'info_dict': { 'id': 'glavnoe', 'ext': 'mp4', 'title': 'Итоги недели с 8 по 14 июня 2015 года', 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/', 'only_matching': True, }, { 'url': 'http://5-tv.ru/films/1507502/', 'only_matching': True, }, { 'url': 'http://5-tv.ru/programs/broadcast/508713/', 'only_matching': True, }, { 'url': 'http://5-tv.ru/angel/', 'only_matching': True, }, { 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') or mobj.group('path') webpage = self._download_webpage(url, video_id) video_url = self._search_regex( r']+?href="([^"]+)"[^>]+?class="videoplayer"', webpage, 'video url') title = self._og_search_title(webpage, default=None) or self._search_regex( r'([^<]+)', webpage, 'title') duration = int_or_none(self._og_search_property( 'video:duration', webpage, 'duration', default=None)) return { 'id': video_id, 'url': video_url, 'title': title, 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'duration': duration, } youtube-dl/youtube_dl/extractor/freesound.py0000644000000000000000000000256012641030331020373 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor class FreesoundIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P[^/]+)' _TEST = { 'url': 'http://www.freesound.org/people/miklovan/sounds/194503/', 'md5': '12280ceb42c81f19a515c745eae07650', 'info_dict': { 'id': '194503', 'ext': 'mp3', 'title': 'gulls in the city.wav', 'uploader': 
'miklovan', 'description': 'the sounds of seagulls in the city', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) music_id = mobj.group('id') webpage = self._download_webpage(url, music_id) title = self._html_search_regex( r'
<div id="single_sample_header">.*?<a href="#">(.+?)</a>', webpage, 'music title', flags=re.DOTALL) description = self._html_search_regex( r'
<div id="sound_description">(.*?)</div>
    ', webpage, 'description', fatal=False, flags=re.DOTALL) return { 'id': music_id, 'title': title, 'url': self._og_search_property('audio', webpage, 'music url'), 'uploader': self._og_search_property('audio:artist', webpage, 'music uploader'), 'description': description, } youtube-dl/youtube_dl/extractor/yandexmusic.py0000644000000000000000000001476212641030331020741 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re import hashlib from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, ) from ..utils import ( int_or_none, float_or_none, sanitized_Request, ) class YandexMusicTrackIE(InfoExtractor): IE_NAME = 'yandexmusic:track' IE_DESC = 'Яндекс.Музыка - Трек' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/track/(?P\d+)' _TEST = { 'url': 'http://music.yandex.ru/album/540508/track/4878838', 'md5': 'f496818aa2f60b6c0062980d2e00dc20', 'info_dict': { 'id': '4878838', 'ext': 'mp3', 'title': 'Carlo Ambrosio - Gypsy Eyes 1', 'filesize': 4628061, 'duration': 193.04, } } def _get_track_url(self, storage_dir, track_id): data = self._download_json( 'http://music.yandex.ru/api/v1.5/handlers/api-jsonp.jsx?action=getTrackSrc&p=download-info/%s' % storage_dir, track_id, 'Downloading track location JSON') key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest() storage = storage_dir.split('.') return ('http://%s/get-mp3/%s/%s?track-id=%s&from=service-10-track&similarities-experiment=default' % (data['host'], key, data['ts'] + data['path'], storage[1])) def _get_track_info(self, track): thumbnail = None cover_uri = track.get('albums', [{}])[0].get('coverUri') if cover_uri: thumbnail = cover_uri.replace('%%', 'orig') if not thumbnail.startswith('http'): thumbnail = 'http://' + thumbnail return { 'id': track['id'], 'ext': 'mp3', 'url': self._get_track_url(track['storageDir'], track['id']), 'title': '%s - %s' % (track['artists'][0]['name'], track['title']), 'filesize': int_or_none(track.get('fileSize')), 'duration': float_or_none(track.get('durationMs'), 1000), 'thumbnail': thumbnail, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) album_id, track_id = mobj.group('album_id'), mobj.group('id') track = self._download_json( 'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id), track_id, 'Downloading track JSON')['track'] return self._get_track_info(track) class YandexMusicPlaylistBaseIE(InfoExtractor): def _build_playlist(self, tracks): return [ self.url_result( 'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id'])) for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)] class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:album' IE_DESC = 'Яндекс.Музыка - Альбом' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/?(\?|$)' _TEST = { 'url': 'http://music.yandex.ru/album/540508', 'info_dict': { 'id': '540508', 'title': 'Carlo Ambrosio - Gypsy Soul (2009)', }, 'playlist_count': 50, } def _real_extract(self, url): album_id = self._match_id(url) album = self._download_json( 'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id, album_id, 'Downloading album JSON') entries = self._build_playlist(album['volumes'][0]) title = '%s - %s' % (album['artists'][0]['name'], album['title']) year = album.get('year') if year: title += ' (%s)' % year return self.playlist_result(entries, compat_str(album['id']), title) class 
YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:playlist' IE_DESC = 'Яндекс.Музыка - Плейлист' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P\d+)' _TESTS = [{ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', 'info_dict': { 'id': '1245', 'title': 'Что слушают Enter Shikari', 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, 'playlist_count': 6, }, { # playlist exceeding the limit of 150 tracks shipped with webpage (see # https://github.com/rg3/youtube-dl/issues/6666) 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', 'info_dict': { 'id': '1036', 'title': 'Музыка 90-х', }, 'playlist_count': 310, }] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) mu = self._parse_json( self._search_regex( r'var\s+Mu\s*=\s*({.+?});\s*', webpage, 'player'), playlist_id) playlist = mu['pageData']['playlist'] tracks, track_ids = playlist['tracks'], playlist['trackIds'] # tracks dictionary shipped with webpage is limited to 150 tracks, # missing tracks should be retrieved manually. if len(tracks) < len(track_ids): present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) request = sanitized_Request( 'https://music.yandex.ru/handlers/track-entries.jsx', compat_urllib_parse.urlencode({ 'entries': ','.join(missing_track_ids), 'lang': mu.get('settings', {}).get('lang', 'en'), 'external-domain': 'music.yandex.ru', 'overembed': 'false', 'sign': mu.get('authData', {}).get('user', {}).get('sign'), 'strict': 'true', }).encode('utf-8')) request.add_header('Referer', url) request.add_header('X-Requested-With', 'XMLHttpRequest') missing_tracks = self._download_json( request, playlist_id, 'Downloading missing tracks JSON', fatal=False) if missing_tracks: tracks.extend(missing_tracks) return self.playlist_result( self._build_playlist(tracks), compat_str(playlist_id), playlist['title'], playlist.get('description')) youtube-dl/youtube_dl/extractor/ssa.py0000644000000000000000000000352112641030331017165 0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( unescapeHTML, parse_duration, ) class SSAIE(InfoExtractor): _VALID_URL = r'http://ssa\.nls\.uk/film/(?P\d+)' _TEST = { 'url': 'http://ssa.nls.uk/film/3561', 'info_dict': { 'id': '3561', 'ext': 'flv', 'title': 'SHETLAND WOOL', 'description': 'md5:c5afca6871ad59b4271e7704fe50ab04', 'duration': 900, 'thumbnail': 're:^https?://.*\.jpg$', }, 'params': { # rtmp download 'skip_download': True, }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) streamer = self._search_regex( r"'streamer'\s*,\S*'(rtmp[^']+)'", webpage, 'streamer') play_path = self._search_regex( r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0] def search_field(field_name, fatal=False): return self._search_regex( r'%s:\s*([^<]+)' % field_name, webpage, 'title', fatal=fatal) title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]') description = unescapeHTML(search_field('Description')) duration = parse_duration(search_field('Running time')) thumbnail = self._search_regex( r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False) return { 'id': video_id, 'url': streamer, 'play_path': play_path, 'ext': 'flv', 'title': title, 'description': description, 'duration': duration, 'thumbnail': thumbnail, } 
youtube-dl/youtube_dl/extractor/vodlocker.py0000644000000000000000000000534612646236655020424 0ustar rootroot# -*- coding: utf-8 -*- from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, NO_DEFAULT, sanitized_Request, ) class VodlockerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vodlocker\.(?:com|city)/(?:embed-)?(?P[0-9a-zA-Z]+)(?:\..*?)?' _TESTS = [{ 'url': 'http://vodlocker.com/e8wvyzz4sl42', 'md5': 'ce0c2d18fa0735f1bd91b69b0e54aacf', 'info_dict': { 'id': 'e8wvyzz4sl42', 'ext': 'mp4', 'title': 'Germany vs Brazil', 'thumbnail': 're:http://.*\.jpg', }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) if any(p in webpage for p in ( '>THIS FILE WAS DELETED<', '>File Not Found<', 'The file you were looking for could not be found, sorry for any inconvenience.<')): raise ExtractorError('Video %s does not exist' % video_id, expected=True) fields = self._hidden_inputs(webpage) if fields['op'] == 'download1': self._sleep(3, video_id) # they do detect when requests happen too fast! post = compat_urllib_parse.urlencode(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') webpage = self._download_webpage( req, video_id, 'Downloading video page') def extract_file_url(html, default=NO_DEFAULT): return self._search_regex( r'file:\s*"(http[^\"]+)",', html, 'file url', default=default) video_url = extract_file_url(webpage, default=None) if not video_url: embed_url = self._search_regex( r']+src=(["\'])(?P(?:https?://)?vodlocker\.(?:com|city)/embed-.+?)\1', webpage, 'embed url', group='url') embed_webpage = self._download_webpage( embed_url, video_id, 'Downloading embed webpage') video_url = extract_file_url(embed_webpage) thumbnail_webpage = embed_webpage else: thumbnail_webpage = webpage title = self._search_regex( r'id="file_title".*?>\s*(.*?)\s*<(?:br|span)', webpage, 'title') thumbnail = self._search_regex( r'image:\s*"(http[^\"]+)",', thumbnail_webpage, 'thumbnail', fatal=False) formats = [{ 'format_id': 'sd', 'url': video_url, }] return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'formats': formats, } youtube-dl/youtube_dl/extractor/thisav.py0000644000000000000000000000307312641030331017677 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import determine_ext class ThisAVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P[0-9]+)/.*' _TEST = { 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', 'md5': '0480f1ef3932d901f0e0e719f188f19b', 'info_dict': { 'id': '47734', 'ext': 'flv', 'title': '高樹マリア - Just fit', 'uploader': 'dj7970', 'uploader_id': 'dj7970' } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r'

<h1>([^<]*)</h1>
    ', webpage, 'title') video_url = self._html_search_regex( r"addVariable\('file','([^']+)'\);", webpage, 'video url') uploader = self._html_search_regex( r': ([^<]+)', webpage, 'uploader name', fatal=False) uploader_id = self._html_search_regex( r': (?:[^<]+)', webpage, 'uploader id', fatal=False) ext = determine_ext(video_url) return { 'id': video_id, 'url': video_url, 'uploader': uploader, 'uploader_id': uploader_id, 'title': title, 'ext': ext, } youtube-dl/youtube_dl/extractor/mailru.py0000644000000000000000000001152112656435032017704 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( int_or_none, remove_end, ) class MailRuIE(InfoExtractor): IE_NAME = 'mailru' IE_DESC = 'Видео@Mail.Ru' _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P(?:[^/]+/){3}\d+)|(?:(?P(?:[^/]+/){2})video/(?P[^/]+/\d+))\.html)' _TESTS = [ { 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76', 'md5': 'dea205f03120046894db4ebb6159879a', 'info_dict': { 'id': '46301138_76', 'ext': 'mp4', 'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро', 'timestamp': 1393232740, 'upload_date': '20140224', 'uploader': 'sonypicturesrus', 'uploader_id': 'sonypicturesrus@mail.ru', 'duration': 184, }, 'skip': 'Not accessible from Travis CI server', }, { 'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html', 'md5': '00a91a58c3402204dcced523777b475f', 'info_dict': { 'id': '46843144_1263', 'ext': 'mp4', 'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion', 'timestamp': 1397039888, 'upload_date': '20140409', 'uploader': 'hitech@corp.mail.ru', 'uploader_id': 'hitech@corp.mail.ru', 'duration': 245, }, 'skip': 'Not accessible from Travis CI server', }, { # only available via metaUrl API 'url': 'http://my.mail.ru/mail/720pizle/video/_myvideo/502.html', 'md5': '3b26d2491c6949d031a32b96bd97c096', 'info_dict': { 'id': '56664382_502', 'ext': 'mp4', 'title': ':8336', 'timestamp': 1449094163, 'upload_date': '20151202', 'uploader': '720pizle@mail.ru', 'uploader_id': '720pizle@mail.ru', 'duration': 6001, }, 'skip': 'Not accessible from Travis CI server', } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('idv1') if not video_id: video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') webpage = self._download_webpage(url, video_id) video_data = None page_config = self._parse_json(self._search_regex( r'(?s)]+class="sp-video__page-config"[^>]*>(.+?)', webpage, 'page config', default='{}'), video_id, fatal=False) if page_config: meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') if meta_url: video_data = self._download_json( meta_url, video_id, 'Downloading video meta JSON', fatal=False) # Fallback old approach if not video_data: video_data = self._download_json( 'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON') formats = [] for f in video_data['videos']: video_url = f.get('url') if not video_url: continue format_id = f.get('key') height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None formats.append({ 'url': video_url, 'format_id': format_id, 'height': height, }) self._sort_formats(formats) meta_data = video_data['meta'] title = remove_end(meta_data['title'], '.mp4') author = video_data.get('author') uploader = author.get('name') uploader_id = author.get('id') or author.get('email') view_count 
= int_or_none(video_data.get('viewsCount') or video_data.get('views_count')) acc_id = meta_data.get('accId') item_id = meta_data.get('itemId') content_id = '%s_%s' % (acc_id, item_id) if acc_id and item_id else video_id thumbnail = meta_data.get('poster') duration = int_or_none(meta_data.get('duration')) timestamp = int_or_none(meta_data.get('timestamp')) return { 'id': content_id, 'title': title, 'thumbnail': thumbnail, 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, 'duration': duration, 'view_count': view_count, 'formats': formats, } youtube-dl/youtube_dl/extractor/minhateca.py0000644000000000000000000000467612641030331020344 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( int_or_none, parse_duration, parse_filesize, sanitized_Request, ) class MinhatecaIE(InfoExtractor): _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P[0-9]+)\.' _TEST = { 'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)', 'info_dict': { 'id': '125848331', 'ext': 'mp4', 'title': 'youtube-dl test video', 'thumbnail': 're:^https?://.*\.jpg$', 'filesize_approx': 1530000, 'duration': 9, 'view_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) token = self._html_search_regex( r'(.*?)

', webpage, 'request verification token') data = [ ('fileId', video_id), ('__RequestVerificationToken', token), ] req = sanitized_Request( 'http://minhateca.com.br/action/License/Download', data=compat_urllib_parse.urlencode(data)) req.add_header('Content-Type', 'application/x-www-form-urlencoded') data = self._download_json( req, video_id, note='Downloading metadata') video_url = data['redirectUrl'] title_str = self._html_search_regex( r'<h1>(.*?)</h1>', webpage, 'title') title, _, ext = title_str.rpartition('.') filesize_approx = parse_filesize(self._html_search_regex( r'

<p class="fileSize">(.*?)</p>
    ', webpage, 'file size approximation', fatal=False)) duration = parse_duration(self._html_search_regex( r'(?s)

<p class="fileLeng[ht][th]">.*?class="bold">(.*?)<', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( r'

<p class="downloadsCounter">([0-9]+)</p>
    ', webpage, 'view count', fatal=False)) return { 'id': video_id, 'url': video_url, 'title': title, 'ext': ext, 'filesize_approx': filesize_approx, 'duration': duration, 'view_count': view_count, 'thumbnail': self._og_search_thumbnail(webpage), } youtube-dl/youtube_dl/extractor/telemb.py0000644000000000000000000000562412641030331017655 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import remove_start class TeleMBIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?telemb\.be/(?P.+?)_d_(?P\d+)\.html' _TESTS = [ { 'url': 'http://www.telemb.be/mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-_d_13466.html', 'md5': 'f45ea69878516ba039835794e0f8f783', 'info_dict': { 'id': '13466', 'display_id': 'mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-', 'ext': 'mp4', 'title': 'Mons - Cook with Danielle : des cours de cuisine en anglais ! - Les reportages', 'description': 'md5:bc5225f47b17c309761c856ad4776265', 'thumbnail': 're:^http://.*\.(?:jpg|png)$', } }, { # non-ASCII characters in download URL 'url': 'http://telemb.be/les-reportages-havre-incendie-mortel_d_13514.html', 'md5': '6e9682736e5ccd4eab7f21e855350733', 'info_dict': { 'id': '13514', 'display_id': 'les-reportages-havre-incendie-mortel', 'ext': 'mp4', 'title': 'Havré - Incendie mortel - Les reportages', 'description': 'md5:5e54cb449acb029c2b7734e2d946bd4a', 'thumbnail': 're:^http://.*\.(?:jpg|png)$', } }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) formats = [] for video_url in re.findall(r'file\s*:\s*"([^"]+)"', webpage): fmt = { 'url': video_url, 'format_id': video_url.split(':')[0] } rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?Pmp4:.+)$', video_url) if rtmp: fmt.update({ 'play_path': rtmp.group('playpath'), 'app': rtmp.group('app'), 'player_url': 'http://p.jwpcdn.com/6/10/jwplayer.flash.swf', 'page_url': 'http://www.telemb.be', 'preference': -1, }) formats.append(fmt) self._sort_formats(formats) title = remove_start(self._og_search_title(webpage), 'TéléMB : ') description = self._html_search_regex( r'', webpage, 'description', fatal=False) thumbnail = self._og_search_thumbnail(webpage) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'formats': formats, } youtube-dl/youtube_dl/extractor/lrt.py0000644000000000000000000000425212641030331017202 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, parse_duration, remove_end, ) class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P[0-9]+)' _TEST = { 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', 'info_dict': { 'id': '54391', 'ext': 'mp4', 'title': 'Septynios Kauno dienos', 'description': 'md5:24d84534c7dc76581e59f5689462411a', 'duration': 1783, 'view_count': int, 'like_count': int, }, 'params': { 'skip_download': True, # m3u8 download }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' - LRT') m3u8_url = self._search_regex( r'file\s*:\s*(["\'])(?P.+?)\1\s*\+\s*location\.hash\.substring\(1\)', webpage, 'm3u8 url', group='url') formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') thumbnail = 
self._og_search_thumbnail(webpage) description = self._og_search_description(webpage) duration = parse_duration(self._search_regex( r'var\s+record_len\s*=\s*(["\'])(?P[0-9]+:[0-9]+:[0-9]+)\1', webpage, 'duration', default=None, group='duration')) view_count = int_or_none(self._html_search_regex( r']+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P.+?)', webpage, 'view count', fatal=False, group='count')) like_count = int_or_none(self._search_regex( r']+id=(["\'])flikesCount.*?\1>(?P\d+)<', webpage, 'like count', fatal=False, group='count')) return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'description': description, 'duration': duration, 'view_count': view_count, 'like_count': like_count, } youtube-dl/youtube_dl/extractor/lemonde.py0000644000000000000000000000253012650650456020041 0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor class LemondeIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?lemonde\.fr/(?:[^/]+/)*(?P[^/]+)\.html' _TESTS = [{ 'url': 'http://www.lemonde.fr/police-justice/video/2016/01/19/comprendre-l-affaire-bygmalion-en-cinq-minutes_4849702_1653578.html', 'md5': '01fb3c92de4c12c573343d63e163d302', 'info_dict': { 'id': 'lqm3kl', 'ext': 'mp4', 'title': "Comprendre l'affaire Bygmalion en 5 minutes", 'thumbnail': 're:^https?://.*\.jpg', 'duration': 320, 'upload_date': '20160119', 'timestamp': 1453194778, 'uploader_id': '3pmkp', }, }, { 'url': 'http://redaction.actu.lemonde.fr/societe/video/2016/01/18/calais-debut-des-travaux-de-defrichement-dans-la-jungle_4849233_3224.html', 'only_matching': True, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) digiteka_url = self._proto_relative_url(self._search_regex( r'url\s*:\s*(["\'])(?P(?:https?://)?//(?:www\.)?(?:digiteka\.net|ultimedia\.com)/deliver/.+?)\1', webpage, 'digiteka url', group='url')) return self.url_result(digiteka_url, 'Digiteka') youtube-dl/youtube_dl/extractor/ccc.py0000644000000000000000000001072612660177411017150 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( int_or_none, parse_duration, qualities, unified_strdate, ) class CCCIE(InfoExtractor): IE_NAME = 'media.ccc.de' _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/v/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video', 'md5': '3a1eda8f3a29515d27f5adb967d7e740', 'info_dict': { 'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor', 'ext': 'mp4', 'title': 'Introduction to Processor Design', 'description': 'md5:80be298773966f66d56cb11260b879af', 'thumbnail': 're:^https?://.*\.jpg$', 'view_count': int, 'upload_date': '20131228', 'duration': 3660, } }, { 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) if self._downloader.params.get('prefer_free_formats'): preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd']) else: preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd']) title = self._html_search_regex( r'(?s)

<h1>(.*?)</h1>
    ', webpage, 'title') description = self._html_search_regex( r'(?s)

<h3>About</h3>(.+?)<h3>
    ', webpage, 'description', fatal=False) upload_date = unified_strdate(self._html_search_regex( r"(?s)]+class='[^']*fa-calendar-o'[^>]*>(.+?)", webpage, 'upload date', fatal=False)) view_count = int_or_none(self._html_search_regex( r"(?s)(.*?)", webpage, 'view count', fatal=False)) duration = parse_duration(self._html_search_regex( r'(?s)]+class=(["\']).*?fa-clock-o.*?\1[^>]*>(?P.+?)(?P[^<]*)\s* <(?:span|div)\s+class='label\s+filetype'>(?P[^<]*)\s* [^']+)'>\s* (?: .*? [^']+\.torrent)' )?''', webpage) formats = [] for m in matches: format = m.group('format') format_id = self._search_regex( r'.*/([a-z0-9_-]+)/[^/]*$', m.group('http_url'), 'format id', default=None) if format_id: format_id = m.group('lang') + '-' + format_id vcodec = 'h264' if 'h264' in format_id else ( 'none' if format_id in ('mp3', 'opus') else None ) formats.append({ 'format_id': format_id, 'format': format, 'language': m.group('lang'), 'url': m.group('http_url'), 'vcodec': vcodec, 'preference': preference(format_id), }) if m.group('torrent_url'): formats.append({ 'format_id': 'torrent-%s' % (format if format_id is None else format_id), 'format': '%s (torrent)' % format, 'proto': 'torrent', 'format_note': '(unsupported; will just download the .torrent file)', 'vcodec': vcodec, 'preference': -100 + preference(format_id), 'url': m.group('torrent_url'), }) self._sort_formats(formats) thumbnail = self._html_search_regex( r"\d+)' _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', 'md5': '4d7d549451bad625e0ff3d7bd56d776c', 'info_dict': { 'id': '4629301', 'ext': 'mp4', 'title': 'Brick Briscoe', 'duration': 612, 'thumbnail': 're:^https?://.+\.jpg', }, }, { 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) js_player = self._download_webpage( 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, video_id, 'Downlaoding player') # it includes a required token flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars') pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, video_id, 'Downloading video info', transform_source=fix_xml_ampersands) track_doc = pdoc.find('trackList/track') def find_param(name): node = find_xpath_attr(track_doc, './/param', 'name', name) if node is not None: return node.attrib['value'] return { 'id': video_id, 'title': find_param('title'), 'url': track_doc.find('location').text, 'thumbnail': find_param('thumbnail'), 'duration': int(find_param('duration')), } youtube-dl/youtube_dl/extractor/teachingchannel.py0000644000000000000000000000200012641030331021501 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from .ooyala import OoyalaIE class TeachingChannelIE(InfoExtractor): _VALID_URL = r'https?://www\.teachingchannel\.org/videos/(?P.+)' _TEST = { 'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution', 'info_dict': { 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', 'ext': 'mp4', 'title': 'A History of Teaming', 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', 'duration': 422.255, }, 'params': { # m3u8 download 'skip_download': True, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) ooyala_code = self._search_regex( r'data-embed-code=\'(.+?)\'', webpage, 'ooyala code') return OoyalaIE._build_url_result(ooyala_code) 
youtube-dl/youtube_dl/extractor/ctsnews.py0000644000000000000000000000713112641030331020066 0ustar rootroot# -*- coding: utf-8 -*- from __future__ import unicode_literals from .common import InfoExtractor from ..utils import parse_iso8601, ExtractorError class CtsNewsIE(InfoExtractor): IE_DESC = '華視新聞' # https connection failed (Connection reset) _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html' _TESTS = [{ 'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html', 'md5': 'a9875cb790252b08431186d741beaabe', 'info_dict': { 'id': '201501291578109', 'ext': 'mp4', 'title': '以色列.真主黨交火 3人死亡', 'description': 'md5:95e9b295c898b7ff294f09d450178d7d', 'timestamp': 1422528540, 'upload_date': '20150129', } }, { # News count not appear on page but still available in database 'url': 'http://news.cts.com.tw/cts/international/201309/201309031304098.html', 'md5': '3aee7e0df7cdff94e43581f54c22619e', 'info_dict': { 'id': '201309031304098', 'ext': 'mp4', 'title': '韓國31歲童顏男 貌如十多歲小孩', 'description': 'md5:f183feeba3752b683827aab71adad584', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1378205880, 'upload_date': '20130903', } }, { # With Youtube embedded video 'url': 'http://news.cts.com.tw/cts/money/201501/201501291578003.html', 'md5': '1d842c771dc94c8c3bca5af2cc1db9c5', 'add_ie': ['Youtube'], 'info_dict': { 'id': 'OVbfO7d0_hQ', 'ext': 'mp4', 'title': 'iPhone6熱銷 蘋果財報亮眼', 'description': 'md5:f395d4f485487bb0f992ed2c4b07aa7d', 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20150128', 'uploader_id': 'TBSCTS', 'uploader': '中華電視公司', } }] def _real_extract(self, url): news_id = self._match_id(url) page = self._download_webpage(url, news_id) if self._search_regex(r'(CTSPlayer2)', page, 'CTSPlayer2 identifier', default=None): feed_url = self._html_search_regex( r'(http://news\.cts\.com\.tw/action/mp4feed\.php\?news_id=\d+)', page, 'feed url') video_url = self._download_webpage( feed_url, news_id, note='Fetching feed') else: self.to_screen('Not CTSPlayer video, trying Youtube...') youtube_url = self._search_regex( r'src="(//www\.youtube\.com/embed/[^"]+)"', page, 'youtube url', default=None) if not youtube_url: raise ExtractorError('The news includes no videos!', expected=True) return { '_type': 'url', 'url': youtube_url, 'ie_key': 'Youtube', } description = self._html_search_meta('description', page) title = self._html_search_meta('title', page) thumbnail = self._html_search_meta('image', page) datetime_str = self._html_search_regex( r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time') # Transform into ISO 8601 format with timezone info datetime_str = datetime_str.replace('/', '-') + ':00+0800' timestamp = parse_iso8601(datetime_str, delimiter=' ') return { 'id': news_id, 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, }
youtube-dl/youtube_dl/extractor/adobetv.py0000644000000000000000000001610412641030331020024 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( parse_duration, unified_strdate, str_to_int, int_or_none, float_or_none, ISO639Utils, determine_ext, ) class AdobeTVBaseIE(InfoExtractor): _API_BASE_URL = 'http://tv.adobe.com/api/v4/' class AdobeTVIE(AdobeTVBaseIE): _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?watch/(?P<show_urlname>[^/]+)/(?P<id>[^/]+)' _TEST = { 'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/', 'md5': '9bc5727bcdd55251f35ad311ca74fa1e', 'info_dict': { 'id': '10981', 'ext': 'mp4', 'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop', 'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311', 'thumbnail': 're:https?://.*\.jpg$', 'upload_date': '20110914', 'duration': 60, 'view_count': int, }, } def _real_extract(self, url): language, show_urlname, urlname = re.match(self._VALID_URL, url).groups() if not language: language = 'en' video_data = self._download_json( self._API_BASE_URL + 'episode/get/?language=%s&show_urlname=%s&urlname=%s&disclosure=standard' % (language, show_urlname, urlname), urlname)['data'][0] formats = [{ 'url': source['url'], 'format_id': source.get('quality_level') or source['url'].split('-')[-1].split('.')[0] or None, 'width': int_or_none(source.get('width')), 'height': int_or_none(source.get('height')), 'tbr': int_or_none(source.get('video_data_rate')), } for source in video_data['videos']] self._sort_formats(formats) return { 'id': compat_str(video_data['id']), 'title': video_data['title'], 'description': video_data.get('description'), 'thumbnail': video_data.get('thumbnail'), 'upload_date': unified_strdate(video_data.get('start_date')), 'duration': parse_duration(video_data.get('duration')), 'view_count': str_to_int(video_data.get('playcount')), 'formats': formats, } class AdobeTVPlaylistBaseIE(AdobeTVBaseIE): def _parse_page_data(self, page_data): return [self.url_result(self._get_element_url(element_data)) for element_data in page_data] def _extract_playlist_entries(self, url, display_id): page = self._download_json(url, display_id) entries = self._parse_page_data(page['data']) for page_num in range(2, page['paging']['pages'] + 1): entries.extend(self._parse_page_data( self._download_json(url + '&page=%d' % page_num, display_id)['data'])) return entries class AdobeTVShowIE(AdobeTVPlaylistBaseIE): _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?show/(?P<id>[^/]+)' _TEST = { 'url':
'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost', 'info_dict': { 'id': '36', 'title': 'The Complete Picture with Julieanne Kost', 'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27', }, 'playlist_mincount': 136, } def _get_element_url(self, element_data): return element_data['urls'][0] def _real_extract(self, url): language, show_urlname = re.match(self._VALID_URL, url).groups() if not language: language = 'en' query = 'language=%s&show_urlname=%s' % (language, show_urlname) show_data = self._download_json(self._API_BASE_URL + 'show/get/?%s' % query, show_urlname)['data'][0] return self.playlist_result( self._extract_playlist_entries(self._API_BASE_URL + 'episode/?%s' % query, show_urlname), compat_str(show_data['id']), show_data['show_name'], show_data['show_description']) class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?channel/(?P<id>[^/]+)(?:/(?P<category_urlname>[^/]+))?' _TEST = { 'url': 'http://tv.adobe.com/channel/development', 'info_dict': { 'id': 'development', }, 'playlist_mincount': 96, } def _get_element_url(self, element_data): return element_data['url'] def _real_extract(self, url): language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups() if not language: language = 'en' query = 'language=%s&channel_urlname=%s' % (language, channel_urlname) if category_urlname: query += '&category_urlname=%s' % category_urlname return self.playlist_result( self._extract_playlist_entries(self._API_BASE_URL + 'show/?%s' % query, channel_urlname), channel_urlname) class AdobeTVVideoIE(InfoExtractor): _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)' _TEST = { # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners 'url': 'https://video.tv.adobe.com/v/2456/', 'md5': '43662b577c018ad707a63766462b1e87', 'info_dict': { 'id': '2456', 'ext': 'mp4', 'title': 'New experience with Acrobat DC', 'description': 'New experience with Acrobat DC', 'duration': 248.667, }, } def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json(url + '?format=json', video_id) formats = [{ 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')), 'url': source['src'], 'width': int_or_none(source.get('width')), 'height': int_or_none(source.get('height')), 'tbr': int_or_none(source.get('bitrate')), } for source in video_data['sources']] self._sort_formats(formats) # For both metadata and downloaded files the duration varies among # formats. 
I just pick the max one duration = max(filter(None, [ float_or_none(source.get('duration'), scale=1000) for source in video_data['sources']])) subtitles = {} for translation in video_data.get('translations', []): lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) if lang_id not in subtitles: subtitles[lang_id] = [] subtitles[lang_id].append({ 'url': translation['vttPath'], 'ext': 'vtt', }) return { 'id': video_id, 'formats': formats, 'title': video_data['title'], 'description': video_data.get('description'), 'thumbnail': video_data['video'].get('poster'), 'duration': duration, 'subtitles': subtitles, } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/nrk.py��������������������������������������������������������������0000644�0000000�0000000�00000025205�12653373215�017212� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( determine_ext, ExtractorError, float_or_none, parse_duration, unified_strdate, ) class NRKIE(InfoExtractor): _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' _TESTS = [ { 'url': 'http://www.nrk.no/video/PS*150533', 'md5': 'bccd850baebefe23b56d708a113229c2', 'info_dict': { 'id': '150533', 'ext': 'flv', 'title': 'Dompap og andre fugler i Piip-Show', 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', 'duration': 263, } }, { 'url': 'http://www.nrk.no/video/PS*154915', 'md5': '0b1493ba1aae7d9579a5ad5531bc395a', 'info_dict': { 'id': '154915', 'ext': 'flv', 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', 'duration': 20, } }, ] def _real_extract(self, url): video_id = self._match_id(url) data = self._download_json( 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON') media_url = data.get('mediaUrl') if not media_url: if data['usageRights']['isGeoBlocked']: raise ExtractorError( 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', expected=True) if determine_ext(media_url) == 'f4m': formats = self._extract_f4m_formats( media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds') else: formats = [{ 'url': media_url, 'ext': 'flv', }] duration = parse_duration(data.get('duration')) images = data.get('images') if images: thumbnails = images['webImages'] thumbnails.sort(key=lambda image: image['pixelWidth']) thumbnail = thumbnails[-1]['imageUrl'] else: thumbnail = None return { 'id': video_id, 'title': data['title'], 'description': data['description'], 'duration': duration, 'thumbnail': thumbnail, 'formats': formats, } class NRKPlaylistIE(InfoExtractor): _VALID_URL = 
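
A standalone sketch of the page-walking pattern _extract_playlist_entries uses above: fetch page 1, read the total page count from paging.pages, then request &page=N for the rest. The endpoint and JSON shape are assumptions for illustration, not Adobe's real API.

import json
from urllib.request import urlopen

def fetch_all_pages(base_url):
    # base_url is a hypothetical JSON endpoint that already carries a query string
    first = json.load(urlopen(base_url))
    entries = list(first['data'])
    for page_num in range(2, first['paging']['pages'] + 1):
        entries.extend(json.load(urlopen('%s&page=%d' % (base_url, page_num)))['data'])
    return entries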
youtube-dl/youtube_dl/extractor/nrk.py

# encoding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    determine_ext,
    ExtractorError,
    float_or_none,
    parse_duration,
    unified_strdate,
)


class NRKIE(InfoExtractor):
    _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'

    _TESTS = [
        {
            'url': 'http://www.nrk.no/video/PS*150533',
            'md5': 'bccd850baebefe23b56d708a113229c2',
            'info_dict': {
                'id': '150533',
                'ext': 'flv',
                'title': 'Dompap og andre fugler i Piip-Show',
                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
                'duration': 263,
            }
        },
        {
            'url': 'http://www.nrk.no/video/PS*154915',
            'md5': '0b1493ba1aae7d9579a5ad5531bc395a',
            'info_dict': {
                'id': '154915',
                'ext': 'flv',
                'title': 'Slik høres internett ut når du er blind',
                'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
                'duration': 20,
            }
        },
    ]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        data = self._download_json(
            'http://v8.psapi.nrk.no/mediaelement/%s' % video_id,
            video_id, 'Downloading media JSON')

        media_url = data.get('mediaUrl')

        if not media_url:
            if data['usageRights']['isGeoBlocked']:
                raise ExtractorError(
                    'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
                    expected=True)

        if determine_ext(media_url) == 'f4m':
            formats = self._extract_f4m_formats(
                media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds')
        else:
            formats = [{
                'url': media_url,
                'ext': 'flv',
            }]

        duration = parse_duration(data.get('duration'))

        images = data.get('images')
        if images:
            thumbnails = images['webImages']
            thumbnails.sort(key=lambda image: image['pixelWidth'])
            thumbnail = thumbnails[-1]['imageUrl']
        else:
            thumbnail = None

        return {
            'id': video_id,
            'title': data['title'],
            'description': data['description'],
            'duration': duration,
            'thumbnail': thumbnail,
            'formats': formats,
        }


class NRKPlaylistIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'

    _TESTS = [{
        'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
        'info_dict': {
            'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763',
            'title': 'Gjenopplev den historiske solformørkelsen',
            'description': 'md5:c2df8ea3bac5654a26fc2834a542feed',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449',
        'info_dict': {
            'id': 'rivertonprisen-til-karin-fossum-1.12266449',
            'title': 'Rivertonprisen til Karin Fossum',
            'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.',
        },
        'playlist_count': 5,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        entries = [
            self.url_result('nrk:%s' % video_id, 'NRK')
            for video_id in re.findall(
                r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"',
                webpage)
        ]

        playlist_title = self._og_search_title(webpage)
        playlist_description = self._og_search_description(webpage)

        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)


class NRKTVIE(InfoExtractor):
    IE_DESC = 'NRK TV and NRK Radio'
    _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'

    _TESTS = [
        {
            'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
            'info_dict': {
                'id': 'MUHH48000314',
                'ext': 'mp4',
                'title': '20 spørsmål',
                'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
                'upload_date': '20140523',
                'duration': 1741.52,
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
        {
            'url': 'https://tv.nrk.no/program/mdfp15000514',
            'info_dict': {
                'id': 'mdfp15000514',
                'ext': 'mp4',
                'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting',
                'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
                'upload_date': '20140524',
                'duration': 4605.08,
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
        {
            # single playlist video
            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
            'md5': 'adbd1dbd813edaf532b0a253780719c2',
            'info_dict': {
                'id': 'MSPO40010515-part2',
                'ext': 'flv',
                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
                'upload_date': '20150106',
            },
            'skip': 'Only works from Norway',
        },
        {
            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
            'playlist': [
                {
                    'md5': '9480285eff92d64f06e02a5367970a7a',
                    'info_dict': {
                        'id': 'MSPO40010515-part1',
                        'ext': 'flv',
                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
                        'upload_date': '20150106',
                    },
                },
                {
                    'md5': 'adbd1dbd813edaf532b0a253780719c2',
                    'info_dict': {
                        'id': 'MSPO40010515-part2',
                        'ext': 'flv',
                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
                        'upload_date': '20150106',
                    },
                },
            ],
            'info_dict': {
                'id': 'MSPO40010515',
                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
                'upload_date': '20150106',
                'duration': 6947.5199999999995,
            },
            'skip': 'Only works from Norway',
        },
        {
            'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
            'only_matching': True,
        }
    ]

    def _extract_f4m(self, manifest_url, video_id):
        return self._extract_f4m_formats(
            manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        part_id = mobj.group('part_id')
        base_url = mobj.group('baseurl')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_meta(
            'title', webpage, 'title')
        description = self._html_search_meta(
            'description', webpage, 'description')

        thumbnail = self._html_search_regex(
            r'data-posterimage="([^"]+)"',
            webpage, 'thumbnail', fatal=False)
        upload_date = unified_strdate(self._html_search_meta(
            'rightsfrom', webpage, 'upload date', fatal=False))
        duration = float_or_none(self._html_search_regex(
            r'data-duration="([^"]+)"',
            webpage, 'duration', fatal=False))

        # playlist
        parts = re.findall(
            r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage)
        if parts:
            entries = []
            for current_part_id, stream_url, part_title in parts:
                if part_id and current_part_id != part_id:
                    continue
                video_part_id = '%s-part%s' % (video_id, current_part_id)
                formats = self._extract_f4m(stream_url, video_part_id)
                entries.append({
                    'id': video_part_id,
                    'title': part_title,
                    'description': description,
                    'thumbnail': thumbnail,
                    'upload_date': upload_date,
                    'formats': formats,
                })
            if part_id:
                if entries:
                    return entries[0]
            else:
                playlist = self.playlist_result(entries, video_id, title, description)
                playlist.update({
                    'thumbnail': thumbnail,
                    'upload_date': upload_date,
                    'duration': duration,
                })
                return playlist

        formats = []

        f4m_url = re.search(r'data-media="([^"]+)"', webpage)
        if f4m_url:
            formats.extend(self._extract_f4m(f4m_url.group(1), video_id))

        m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls'))
        self._sort_formats(formats)

        subtitles_url = self._html_search_regex(
            r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1',
            webpage, 'subtitle URL', default=None, group='url')
        subtitles = {}
        if subtitles_url:
            subtitles['no'] = [{
                'ext': 'ttml',
                'url': compat_urlparse.urljoin(base_url, subtitles_url),
            }]

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }
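
A tiny standalone sketch of the thumbnail selection in NRKIE above: sort webImages by pixelWidth and keep the widest variant. The sample data is invented.

images = {'webImages': [
    {'pixelWidth': 300, 'imageUrl': 'http://example.com/w300.jpg'},
    {'pixelWidth': 1280, 'imageUrl': 'http://example.com/w1280.jpg'},
    {'pixelWidth': 600, 'imageUrl': 'http://example.com/w600.jpg'},
]}
thumbnails = images['webImages']
thumbnails.sort(key=lambda image: image['pixelWidth'])
print(thumbnails[-1]['imageUrl'])  # the 1280px variant

max(thumbnails, key=lambda image: image['pixelWidth']) would pick the same element without mutating the list.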
youtube-dl/youtube_dl/extractor/videomega.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import sanitized_Request


class VideoMegaIE(InfoExtractor):
    _WORKING = False
    _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
    _TESTS = [{
        'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
        'md5': 'cc1920a58add3f05c6a93285b84fb3aa',
        'info_dict': {
            'id': 'AOSQBJYKIDDIKYJBQSOA',
            'ext': 'mp4',
            'title': '1254207',
            'thumbnail': 're:^https?://.*\.jpg$',
        }
    }, {
        'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600',
        'only_matching': True,
    }, {
        'url': 'http://videomega.tv/view.php?ref=090051111052065112106089103052052103089106112065052111051090',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id
        req = sanitized_Request(iframe_url)
        req.add_header('Referer', url)
        req.add_header('Cookie', 'noadvtday=0')
        webpage = self._download_webpage(req, video_id)

        title = self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'title')
        title = re.sub(
            r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title)
        thumbnail = self._search_regex(
            r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
        video_url = self._search_regex(
            r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')

        return {
            'id': video_id,
            'title': title,
            'url': video_url,
            'thumbnail': thumbnail,
            'http_headers': {
                'Referer': iframe_url,
            },
        }
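
VideoMegaIE above only gets a usable page when the iframe request carries a Referer and a cookie; a standalone Python 3 sketch of the same request shape, with the stdlib standing in for youtube-dl's sanitized_Request and a placeholder ref value:

from urllib.request import Request, urlopen

iframe_url = 'http://videomega.tv/cdn.php?ref=EXAMPLEREF'  # placeholder ref
req = Request(iframe_url)
req.add_header('Referer', 'http://videomega.tv/?ref=EXAMPLEREF')
req.add_header('Cookie', 'noadvtday=0')
# webpage = urlopen(req).read().decode('utf-8')  # live request, left commented out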
youtube-dl/youtube_dl/extractor/limelight.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    float_or_none,
    int_or_none,
)


class LimelightBaseIE(InfoExtractor):
    _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
    _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json'

    def _call_playlist_service(self, item_id, method, fatal=True):
        return self._download_json(
            self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method),
            item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal)

    def _call_api(self, organization_id, item_id, method):
        return self._download_json(
            self._API_URL % (organization_id, self._API_PATH, item_id, method),
            item_id, 'Downloading API %s JSON' % method)

    def _extract(self, item_id, pc_method, mobile_method, meta_method):
        pc = self._call_playlist_service(item_id, pc_method)
        metadata = self._call_api(pc['orgId'], item_id, meta_method)
        mobile = self._call_playlist_service(item_id, mobile_method, fatal=False)
        return pc, mobile, metadata

    def _extract_info(self, streams, mobile_urls, properties):
        video_id = properties['media_id']
        formats = []

        for stream in streams:
            stream_url = stream.get('url')
            if not stream_url:
                continue
            if '.f4m' in stream_url:
                formats.extend(self._extract_f4m_formats(
                    stream_url, video_id, fatal=False))
            else:
                fmt = {
                    'url': stream_url,
                    'abr': float_or_none(stream.get('audioBitRate')),
                    'vbr': float_or_none(stream.get('videoBitRate')),
                    'fps': float_or_none(stream.get('videoFrameRate')),
                    'width': int_or_none(stream.get('videoWidthInPixels')),
                    'height': int_or_none(stream.get('videoHeightInPixels')),
                    'ext': determine_ext(stream_url)
                }
                rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url)
                if rtmp:
                    format_id = 'rtmp'
                    if stream.get('videoBitRate'):
                        format_id += '-%d' % int_or_none(stream['videoBitRate'])
                    fmt.update({
                        'url': rtmp.group('url'),
                        'play_path': rtmp.group('playpath'),
                        'app': rtmp.group('app'),
                        'ext': 'flv',
                        'format_id': format_id,
                    })
                formats.append(fmt)

        for mobile_url in mobile_urls:
            media_url = mobile_url.get('mobileUrl')
            if not media_url:
                continue
            format_id = mobile_url.get('targetMediaPlatform')
            if determine_ext(media_url) == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id=format_id, fatal=False))
            else:
                formats.append({
                    'url': media_url,
                    'format_id': format_id,
                    'preference': -1,
                })

        self._sort_formats(formats)

        title = properties['title']
        description = properties.get('description')
        timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date'))
        duration = float_or_none(properties.get('duration_in_milliseconds'), 1000)
        filesize = int_or_none(properties.get('total_storage_in_bytes'))
        categories = [properties.get('category')]
        tags = properties.get('tags', [])
        thumbnails = [{
            'url': thumbnail['url'],
            'width': int_or_none(thumbnail.get('width')),
            'height': int_or_none(thumbnail.get('height')),
        } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')]

        subtitles = {}
        for caption in properties.get('captions', {}):
            lang = caption.get('language_code')
            subtitles_url = caption.get('url')
            if lang and subtitles_url:
                subtitles[lang] = [{
                    'url': subtitles_url,
                }]

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'timestamp': timestamp,
            'duration': duration,
            'filesize': filesize,
            'categories': categories,
            'tags': tags,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
        }


class LimelightMediaIE(LimelightBaseIE):
    IE_NAME = 'limelight'
    _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})'
    _TESTS = [{
        'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
        'info_dict': {
            'id': '3ffd040b522b4485b6d84effc750cd86',
            'ext': 'flv',
            'title': 'HaP and the HB Prince Trailer',
            'description': 'md5:8005b944181778e313d95c1237ddb640',
            'thumbnail': 're:^https?://.*\.jpeg$',
            'duration': 144.23,
            'timestamp': 1244136834,
            'upload_date': '20090604',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }, {
        # video with subtitles
        'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335',
        'info_dict': {
            'id': 'a3e00274d4564ec4a9b29b9466432335',
            'ext': 'flv',
            'title': '3Play Media Overview Video',
            'description': '',
            'thumbnail': 're:^https?://.*\.jpeg$',
            'duration': 78.101,
            'timestamp': 1338929955,
            'upload_date': '20120605',
            'subtitles': 'mincount:9',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }]
    _PLAYLIST_SERVICE_PATH = 'media'
    _API_PATH = 'media'

    def _real_extract(self, url):
        video_id = self._match_id(url)

        pc, mobile, metadata = self._extract(
            video_id, 'getPlaylistByMediaId', 'getMobilePlaylistByMediaId', 'properties')

        return self._extract_info(
            pc['playlistItems'][0].get('streams', []),
            mobile['mediaList'][0].get('mobileUrls', []) if mobile else [],
            metadata)


class LimelightChannelIE(LimelightBaseIE):
    IE_NAME = 'limelight:channel'
    _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})'
    _TEST = {
        'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082',
        'info_dict': {
            'id': 'ab6a524c379342f9b23642917020c082',
            'title': 'Javascript Sample Code',
        },
        'playlist_mincount': 3,
    }
    _PLAYLIST_SERVICE_PATH = 'channel'
    _API_PATH = 'channels'

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        pc, mobile, medias = self._extract(
            channel_id, 'getPlaylistByChannelId',
            'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', 'media')

        entries = [
            self._extract_info(
                pc['playlistItems'][i].get('streams', []),
                mobile['mediaList'][i].get('mobileUrls', []) if mobile else [],
                medias['media_list'][i])
            for i in range(len(medias['media_list']))]

        return self.playlist_result(entries, channel_id, pc['title'])


class LimelightChannelListIE(LimelightBaseIE):
    IE_NAME = 'limelight:channel_list'
    _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})'
    _TEST = {
        'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b',
        'info_dict': {
            'id': '301b117890c4465c8179ede21fd92e2b',
            'title': 'Website - Hero Player',
        },
        'playlist_mincount': 2,
    }
    _PLAYLIST_SERVICE_PATH = 'channel_list'

    def _real_extract(self, url):
        channel_list_id = self._match_id(url)

        channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById')

        entries = [
            self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel')
            for channel in channel_list['channelList']]

        return self.playlist_result(entries, channel_list_id, channel_list['title'])
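
A standalone sketch of the RTMP URL split in _extract_info above; the stream URL is invented to show how the named groups fall out:

import re

stream_url = 'rtmpe://csl.example.llnw.net/olod/s/folder/mp4:video_720.mp4'  # invented
rtmp = re.search(
    r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url)
print(rtmp.group('url'))       # rtmpe://csl.example.llnw.net/olod/s/folder
print(rtmp.group('app'))       # olod/s/folder
print(rtmp.group('playpath'))  # mp4:video_720.mp4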
youtube-dl/youtube_dl/extractor/rds.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    parse_iso8601,
)


class RDSIE(InfoExtractor):
    IE_DESC = 'RDS.ca'
    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)'

    _TESTS = [{
        'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
        'info_dict': {
            'id': '3.1132799',
            'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
            'ext': 'mp4',
            'title': 'Fowler Jr. prend la direction de Jacksonville',
            'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ',
            'timestamp': 1430397346,
            'upload_date': '20150430',
            'duration': 154.354,
            'age_limit': 0,
        }
    }, {
        'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id')

        webpage = self._download_webpage(url, display_id)

        # TODO: extract f4m from 9c9media.com
        video_url = self._search_regex(
            r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"',
            webpage, 'video url')

        title = self._og_search_title(webpage) or self._html_search_meta(
            'title', webpage, 'title', fatal=True)
        description = self._og_search_description(webpage) or self._html_search_meta(
            'description', webpage, 'description')
        thumbnail = self._og_search_thumbnail(webpage) or self._search_regex(
            [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
             r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
            webpage, 'thumbnail', fatal=False)
        timestamp = parse_iso8601(self._search_regex(
            r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"',
            webpage, 'upload date', fatal=False))
        duration = parse_duration(self._search_regex(
            r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"',
            webpage, 'duration', fatal=False))
        age_limit = self._family_friendly_search(webpage)

        return {
            'id': video_id,
            'display_id': display_id,
            'url': video_url,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'age_limit': age_limit,
        }
youtube-dl/youtube_dl/extractor/philharmoniedeparis.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    float_or_none,
    int_or_none,
    parse_iso8601,
    xpath_text,
)


class PhilharmonieDeParisIE(InfoExtractor):
    IE_DESC = 'Philharmonie de Paris'
    _VALID_URL = r'http://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html',
        'info_dict': {
            'id': '1032066',
            'ext': 'flv',
            'title': 'md5:d1f5585d87d041d07ce9434804bc8425',
            'timestamp': 1428179400,
            'upload_date': '20150404',
            'duration': 6592.278,
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }, {
        'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html',
        'only_matching': True,
    }, {
        'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        concert = self._download_xml(
            'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=%s' % video_id,
            video_id).find('./concert')

        formats = []
        info_dict = {
            'id': video_id,
            'title': xpath_text(concert, './titre', 'title', fatal=True),
            'formats': formats,
        }

        fichiers = concert.find('./fichiers')
        stream = fichiers.attrib['serveurstream']
        for fichier in fichiers.findall('./fichier'):
            info_dict['duration'] = float_or_none(fichier.get('timecodefin'))
            for quality, (format_id, suffix) in enumerate([('lq', ''), ('hq', '_hd')]):
                format_url = fichier.get('url%s' % suffix)
                if not format_url:
                    continue
                formats.append({
                    'url': stream,
                    'play_path': format_url,
                    'ext': 'flv',
                    'format_id': format_id,
                    'width': int_or_none(concert.get('largeur%s' % suffix)),
                    'height': int_or_none(concert.get('hauteur%s' % suffix)),
                    'quality': quality,
                })
        self._sort_formats(formats)

        date, hour = concert.get('date'), concert.get('heure')
        if date and hour:
            info_dict['timestamp'] = parse_iso8601(
                '%s-%s-%sT%s:00' % (date[0:4], date[4:6], date[6:8], hour))
        elif date:
            info_dict['upload_date'] = date

        return info_dict

youtube-dl/youtube_dl/extractor/defense.py

from __future__ import unicode_literals

from .common import InfoExtractor


class DefenseGouvFrIE(InfoExtractor):
    IE_NAME = 'defense.gouv.fr'
    _VALID_URL = r'http://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P<id>[^/?#]*)'

    _TEST = {
        'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1',
        'md5': '75bba6124da7e63d2d60b5244ec9430c',
        'info_dict': {
            'id': '11213',
            'ext': 'mp4',
            'title': 'attaque-chimique-syrienne-du-21-aout-2013-1'
        }
    }

    def _real_extract(self, url):
        title = self._match_id(url)
        webpage = self._download_webpage(url, title)

        video_id = self._search_regex(
            r"flashvars.pvg_id=\"(\d+)\";",
            webpage, 'ID')

        json_url = (
            'http://static.videos.gouv.fr/brightcovehub/export/json/%s' %
            video_id)
        info = self._download_json(json_url, title, 'Downloading JSON config')
        video_url = info['renditions'][0]['url']

        return {
            'id': video_id,
            'ext': 'mp4',
            'url': video_url,
            'title': title,
        }
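
A standalone sketch of the Playlist.ashx parsing in PhilharmonieDeParisIE above, run against an invented XML fragment so the find/findall/attrib calls can be seen end to end (element and attribute names follow the extractor; the data is made up):

import xml.etree.ElementTree as ET

doc = ET.fromstring('''
<playlist>
  <concert date="20150404" heure="19:30">
    <titre>Sample concert</titre>
    <fichiers serveurstream="rtmp://streaming.example/ondemand">
      <fichier timecodefin="6592.278" url="mp4:concert_lq.mp4" url_hd="mp4:concert_hd.mp4"/>
    </fichiers>
  </concert>
</playlist>''')

concert = doc.find('./concert')
fichiers = concert.find('./fichiers')
print(fichiers.attrib['serveurstream'])          # the RTMP server
for fichier in fichiers.findall('./fichier'):
    for format_id, suffix in [('lq', ''), ('hq', '_hd')]:
        format_url = fichier.get('url%s' % suffix)
        if format_url:
            print(format_id, format_url)         # lq/hq play paths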
youtube-dl/youtube_dl/extractor/soundcloud.py

# encoding: utf-8
from __future__ import unicode_literals

import re
import itertools

from .common import (
    InfoExtractor,
    SearchInfoExtractor
)
from ..compat import (
    compat_str,
    compat_urlparse,
    compat_urllib_parse,
)
from ..utils import (
    encode_dict,
    ExtractorError,
    int_or_none,
    unified_strdate,
)


class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token must be
       extracted from the page source and the script must make a request to
       media.soundcloud.com/crossdomain.xml. Then the media can be grabbed by
       requesting a URL composed of the stream token and uid.
    """

    _VALID_URL = r'''(?x)^(?:https?://)?
                    (?:(?:(?:www\.|m\.)?soundcloud\.com/
                            (?P<uploader>[\w\d-]+)/
                            (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#]))
                            (?P<title>[\w\d-]+)/?
                            (?P<token>[^?]+?)?(?:[?].*)?$)
                       |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
                          (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
                       |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
                    )
                    '''
    IE_NAME = 'soundcloud'
    _TESTS = [
        {
            'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
            'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
            'info_dict': {
                'id': '62986583',
                'ext': 'mp3',
                'upload_date': '20121011',
                'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
                'uploader': 'E.T. ExTerrestrial Music',
                'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
                'duration': 143,
            }
        },
        # not streamable song
        {
            'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
            'info_dict': {
                'id': '47127627',
                'ext': 'mp3',
                'title': 'Goldrushed',
                'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
                'uploader': 'The Royal Concept',
                'upload_date': '20120521',
                'duration': 227,
            },
            'params': {
                # rtmp
                'skip_download': True,
            },
        },
        # private link
        {
            'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
            'info_dict': {
                'id': '123998367',
                'ext': 'mp3',
                'title': 'Youtube - Dl Test Video \'\' Ä↭',
                'uploader': 'jaimeMF',
                'description': 'test chars: \"\'/\\ä↭',
                'upload_date': '20131209',
                'duration': 9,
            },
        },
        # private link (alt format)
        {
            'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp',
            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
            'info_dict': {
                'id': '123998367',
                'ext': 'mp3',
                'title': 'Youtube - Dl Test Video \'\' Ä↭',
                'uploader': 'jaimeMF',
                'description': 'test chars: \"\'/\\ä↭',
                'upload_date': '20131209',
                'duration': 9,
            },
        },
        # downloadable song
        {
            'url': 'https://soundcloud.com/oddsamples/bus-brakes',
            'md5': '7624f2351f8a3b2e7cd51522496e7631',
            'info_dict': {
                'id': '128590877',
                'ext': 'mp3',
                'title': 'Bus Brakes',
                'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
                'uploader': 'oddsamples',
                'upload_date': '20140109',
                'duration': 17,
            },
        },
    ]

    _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea'
    _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen('%s: Resolving id' % video_id)

    @classmethod
    def _resolv_url(cls, url):
        return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID

    def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
        track_id = compat_str(info['id'])
        name = full_title or track_id
        if quiet:
            self.report_extraction(name)

        thumbnail = info['artwork_url']
        if thumbnail is not None:
            thumbnail = thumbnail.replace('-large', '-t500x500')
        ext = 'mp3'
        result = {
            'id': track_id,
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title': info['title'],
            'description': info['description'],
            'thumbnail': thumbnail,
            'duration': int_or_none(info.get('duration'), 1000),
            'webpage_url': info.get('permalink_url'),
        }
        formats = []
        if info.get('downloadable', False):
            # We can build a direct link to the song
            format_url = (
                'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
                    track_id, self._CLIENT_ID))
            formats.append({
                'format_id': 'download',
                'ext': info.get('original_format', 'mp3'),
                'url': format_url,
                'vcodec': 'none',
                'preference': 10,
            })

        # We have to retrieve the url
        streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
                       'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
        format_dict = self._download_json(
            streams_url,
            track_id, 'Downloading track url')

        for key, stream_url in format_dict.items():
            if key.startswith('http'):
                formats.append({
                    'format_id': key,
                    'ext': ext,
                    'url': stream_url,
                    'vcodec': 'none',
                })
            elif key.startswith('rtmp'):
                # The url doesn't have an rtmp app, we have to extract the playpath
                url, path = stream_url.split('mp3:', 1)
                formats.append({
                    'format_id': key,
                    'url': url,
                    'play_path': 'mp3:' + path,
                    'ext': 'flv',
                    'vcodec': 'none',
                })

        if not formats:
            # We fallback to the stream_url in the original info, this
            # cannot be always used, sometimes it can give an HTTP 404 error
            formats.append({
                'format_id': 'fallback',
                'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
                'ext': ext,
                'vcodec': 'none',
            })

        for f in formats:
            if f['format_id'].startswith('http'):
                f['protocol'] = 'http'
            if f['format_id'].startswith('rtmp'):
                f['protocol'] = 'rtmp'

        self._check_formats(formats, track_id)
        self._sort_formats(formats)
        result['formats'] = formats

        return result

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        track_id = mobj.group('track_id')
        token = None
        if track_id is not None:
            info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
            full_title = track_id
            token = mobj.group('secret_token')
            if token:
                info_json_url += '&secret_token=' + token
        elif mobj.group('player'):
            query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
            real_url = query['url'][0]
            # If the token is in the query of the original url we have to
            # manually add it
            if 'secret_token' in query:
                real_url += '?secret_token=' + query['secret_token'][0]
            return self.url_result(real_url)
        else:
            # extract uploader (which is in the url)
            uploader = mobj.group('uploader')
            # extract simple title (uploader + slug of song title)
            slug_title = mobj.group('title')
            token = mobj.group('token')
            full_title = resolve_title = '%s/%s' % (uploader, slug_title)
            if token:
                resolve_title += '/%s' % token

            self.report_resolve(full_title)

            url = 'http://soundcloud.com/%s' % resolve_title
            info_json_url = self._resolv_url(url)
        info = self._download_json(info_json_url, full_title, 'Downloading info JSON')

        return self._extract_info_dict(info, full_title, secret_token=token)


class SoundcloudSetIE(SoundcloudIE):
    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
    IE_NAME = 'soundcloud:set'
    _TESTS = [{
        'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
        'info_dict': {
            'id': '2284613',
            'title': 'The Royal Concept EP',
        },
        'playlist_mincount': 6,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        # extract uploader (which is in the url)
        uploader = mobj.group('uploader')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group('slug_title')
        full_title = '%s/sets/%s' % (uploader, slug_title)
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)

        token = mobj.group('token')
        if token:
            full_title += '/' + token
            url += '/' + token

        self.report_resolve(full_title)

        resolv_url = self._resolv_url(url)
        info = self._download_json(resolv_url, full_title)

        if 'errors' in info:
            msgs = (compat_str(err['error_message']) for err in info['errors'])
            raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))

        entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']]

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': '%s' % info['id'],
            'title': info['title'],
        }


class SoundcloudUserIE(SoundcloudIE):
    _VALID_URL = r'''(?x)
                        https?://
                            (?:(?:www|m)\.)?soundcloud\.com/
                            (?P<user>[^/]+)
                            (?:/
                                (?P<rsrc>tracks|sets|reposts|likes|spotlight)
                            )?
                            /?(?:[?#].*)?$
                    '''
    IE_NAME = 'soundcloud:user'
    _TESTS = [{
        'url': 'https://soundcloud.com/the-akashic-chronicler',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (All)',
        },
        'playlist_mincount': 111,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Tracks)',
        },
        'playlist_mincount': 50,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Playlists)',
        },
        'playlist_mincount': 3,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Reposts)',
        },
        'playlist_mincount': 7,
    }, {
        'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
        'info_dict': {
            'id': '114582580',
            'title': 'The Akashic Chronicler (Likes)',
        },
        'playlist_mincount': 321,
    }, {
        'url': 'https://soundcloud.com/grynpyret/spotlight',
        'info_dict': {
            'id': '7098329',
            'title': 'Grynpyret (Spotlight)',
        },
        'playlist_mincount': 1,
    }]

    _API_BASE = 'https://api.soundcloud.com'
    _API_V2_BASE = 'https://api-v2.soundcloud.com'

    _BASE_URL_MAP = {
        'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
        'tracks': '%s/users/%%s/tracks' % _API_BASE,
        'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
        'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
        'likes': '%s/users/%%s/likes' % _API_V2_BASE,
        'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
    }

    _TITLE_MAP = {
        'all': 'All',
        'tracks': 'Tracks',
        'sets': 'Playlists',
        'reposts': 'Reposts',
        'likes': 'Likes',
        'spotlight': 'Spotlight',
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group('user')

        url = 'http://soundcloud.com/%s/' % uploader
        resolv_url = self._resolv_url(url)
        user = self._download_json(
            resolv_url, uploader, 'Downloading user info')

        resource = mobj.group('rsrc') or 'all'
        base_url = self._BASE_URL_MAP[resource] % user['id']

        COMMON_QUERY = {
            'limit': 50,
            'client_id': self._CLIENT_ID,
            'linked_partitioning': '1',
        }

        query = COMMON_QUERY.copy()
        query['offset'] = 0

        next_href = base_url + '?' + compat_urllib_parse.urlencode(query)

        entries = []
        for i in itertools.count():
            response = self._download_json(
                next_href, uploader, 'Downloading track page %s' % (i + 1))

            collection = response['collection']
            if not collection:
                break

            def resolve_permalink_url(candidates):
                for cand in candidates:
                    if isinstance(cand, dict):
                        permalink_url = cand.get('permalink_url')
                        if permalink_url and permalink_url.startswith('http'):
                            return permalink_url

            for e in collection:
                permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
                if permalink_url:
                    entries.append(self.url_result(permalink_url))

            next_href = response.get('next_href')
            if not next_href:
                break

            parsed_next_href = compat_urlparse.urlparse(response['next_href'])
            qs = compat_urlparse.parse_qs(parsed_next_href.query)
            qs.update(COMMON_QUERY)
            next_href = compat_urlparse.urlunparse(
                parsed_next_href._replace(query=compat_urllib_parse.urlencode(qs, True)))

        return {
            '_type': 'playlist',
            'id': compat_str(user['id']),
            'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
            'entries': entries,
        }


class SoundcloudPlaylistIE(SoundcloudIE):
    _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
    IE_NAME = 'soundcloud:playlist'
    _TESTS = [{
        'url': 'http://api.soundcloud.com/playlists/4110309',
        'info_dict': {
            'id': '4110309',
            'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
            'description': 're:.*?TILT Brass - Bowery Poetry Club',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        base_url = '%s//api.soundcloud.com/playlists/%s.json?' % (self.http_scheme(), playlist_id)

        data_dict = {
            'client_id': self._CLIENT_ID,
        }
        token = mobj.group('token')

        if token:
            data_dict['secret_token'] = token

        data = compat_urllib_parse.urlencode(data_dict)
        data = self._download_json(
            base_url + data, playlist_id, 'Downloading playlist')

        entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']]

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': data.get('title'),
            'description': data.get('description'),
            'entries': entries,
        }


class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
    IE_NAME = 'soundcloud:search'
    IE_DESC = 'Soundcloud search'
    _MAX_RESULTS = float('inf')
    _TESTS = [{
        'url': 'scsearch15:post-avant jazzcore',
        'info_dict': {
            'title': 'post-avant jazzcore',
        },
        'playlist_count': 15,
    }]

    _SEARCH_KEY = 'scsearch'
    _MAX_RESULTS_PER_PAGE = 200
    _DEFAULT_RESULTS_PER_PAGE = 50
    _API_V2_BASE = 'https://api-v2.soundcloud.com'

    def _get_collection(self, endpoint, collection_id, **query):
        limit = min(
            query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
            self._MAX_RESULTS_PER_PAGE)
        query['limit'] = limit
        query['client_id'] = self._CLIENT_ID
        query['linked_partitioning'] = '1'
        query['offset'] = 0
        data = compat_urllib_parse.urlencode(encode_dict(query))
        next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data)

        collected_results = 0

        for i in itertools.count(1):
            response = self._download_json(
                next_url, collection_id, 'Downloading page {0}'.format(i),
                'Unable to download API page')

            collection = response.get('collection', [])
            if not collection:
                break

            collection = list(filter(bool, collection))
            collected_results += len(collection)

            for item in collection:
                yield self.url_result(item['uri'], SoundcloudIE.ie_key())

            if not collection or collected_results >= limit:
                break

            next_url = response.get('next_href')
            if not next_url:
                break

    def _get_n_results(self, query, n):
        tracks = self._get_collection('/search/tracks', query, limit=n, q=query)
        return self.playlist_result(tracks, playlist_title=query)
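
A standalone sketch of the pagination-URL rebuild in SoundcloudUserIE above: parse the next_href returned by the API, merge the common query parameters back in, and re-serialize. Python 3 urllib.parse stands in for the compat shims; the URL and client id are placeholders.

from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

next_href = 'https://api.example.com/users/123/tracks?offset=50&limit=50'  # placeholder
COMMON_QUERY = {'limit': 50, 'client_id': 'PLACEHOLDER', 'linked_partitioning': '1'}

parsed = urlparse(next_href)
qs = parse_qs(parsed.query)
qs.update(COMMON_QUERY)  # re-assert limit/client_id on every page
print(urlunparse(parsed._replace(query=urlencode(qs, doseq=True))))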
youtube-dl/youtube_dl/extractor/streamcz.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import hashlib
import time

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    sanitized_Request,
)


def _get_api_key(api_path):
    if api_path.endswith('?'):
        api_path = api_path[:-1]

    api_key = 'fb5f58a820353bd7095de526253c14fd'
    a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600)))
    return hashlib.md5(a.encode('ascii')).hexdigest()


class StreamCZIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)'
    _API_URL = 'http://www.stream.cz/API'

    _TESTS = [{
        'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
        'md5': '6d3ca61a8d0633c9c542b92fcb936b0c',
        'info_dict': {
            'id': '765767',
            'ext': 'mp4',
            'title': 'Peklo na talíři: Éčka pro děti',
            'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE',
            'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
            'duration': 256,
        },
    }, {
        'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
        'md5': 'e54a254fb8b871968fd8403255f28589',
        'info_dict': {
            'id': '10002447',
            'ext': 'mp4',
            'title': 'Kancelář Blaník: Tři roky pro Mazánka',
            'description': 'md5:3862a00ba7bf0b3e44806b544032c859',
            'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000',
            'duration': 368,
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        api_path = '/episode/%s' % video_id

        req = sanitized_Request(self._API_URL + api_path)
        req.add_header('Api-Password', _get_api_key(api_path))
        data = self._download_json(req, video_id)

        formats = []
        for quality, video in enumerate(data['video_qualities']):
            for f in video['formats']:
                typ = f['type'].partition('/')[2]
                qlabel = video.get('quality_label')
                formats.append({
                    'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ,
                    'format_id': '%s-%s' % (typ, f['quality']),
                    'url': f['source'],
                    'height': int_or_none(f['quality'].rstrip('p')),
                    'quality': quality,
                })
        self._sort_formats(formats)

        image = data.get('image')
        if image:
            thumbnail = self._proto_relative_url(
                image.replace('{width}', '1240').replace('{height}', '697'),
                scheme='http:',
            )
        else:
            thumbnail = None

        stream = data.get('_embedded', {}).get('stream:show', {}).get('name')
        if stream:
            title = '%s: %s' % (stream, data['name'])
        else:
            title = data['name']

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
            'description': data.get('web_site_text'),
            'duration': int_or_none(data.get('duration')),
            'view_count': int_or_none(data.get('views')),
        }
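
A standalone restatement of _get_api_key above, runnable as-is: the Api-Password header is the MD5 of a fixed key, the API path, and the current day number, so it rotates daily.

import hashlib
import time

def get_api_key(api_path, api_key='fb5f58a820353bd7095de526253c14fd'):
    if api_path.endswith('?'):
        api_path = api_path[:-1]
    day_number = int(round(time.time() / 24 / 3600))  # seconds -> days since the epoch
    return hashlib.md5('{0}{1}{2}'.format(api_key, api_path, day_number).encode('ascii')).hexdigest()

print(get_api_key('/episode/765767'))  # episode path from the first test above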
youtube-dl/youtube_dl/extractor/teachertube.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    qualities,
    determine_ext,
)


class TeacherTubeIE(InfoExtractor):
    IE_NAME = 'teachertube'
    IE_DESC = 'teachertube.com videos'

    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997',
        'md5': 'f9434ef992fd65936d72999951ee254c',
        'info_dict': {
            'id': '339997',
            'ext': 'mp4',
            'title': 'Measures of dispersion from a frequency table',
            'description': 'Measures of dispersion from a frequency table',
            'thumbnail': 're:http://.*\.jpg',
        },
    }, {
        'url': 'http://www.teachertube.com/viewVideo.php?video_id=340064',
        'md5': '0d625ec6bc9bf50f70170942ad580676',
        'info_dict': {
            'id': '340064',
            'ext': 'mp4',
            'title': 'How to Make Paper Dolls _ Paper Art Projects',
            'description': 'Learn how to make paper dolls in this simple',
            'thumbnail': 're:http://.*\.jpg',
        },
    }, {
        'url': 'http://www.teachertube.com/music.php?music_id=8805',
        'md5': '01e8352006c65757caf7b961f6050e21',
        'info_dict': {
            'id': '8805',
            'ext': 'mp3',
            'title': 'PER ASPERA AD ASTRA',
            'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNI?KE ?KOLE P',
        },
    }, {
        'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790',
        'md5': '9c79fbb2dd7154823996fc28d4a26998',
        'info_dict': {
            'id': '297790',
            'ext': 'mp4',
            'title': 'Intro Video - Schleicher',
            'description': 'Intro Video - Why to flip, how flipping will',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_meta('title', webpage, 'title', fatal=True)
        TITLE_SUFFIX = ' - TeacherTube'
        if title.endswith(TITLE_SUFFIX):
            title = title[:-len(TITLE_SUFFIX)].strip()

        description = self._html_search_meta('description', webpage, 'description')
        if description:
            description = description.strip()

        quality = qualities(['mp3', 'flv', 'mp4'])

        media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage)
        media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage))
        media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage))

        formats = [
            {
                'url': media_url,
                'quality': quality(determine_ext(media_url))
            } for media_url in set(media_urls)
        ]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._html_search_regex(r'\'image\'\s*:\s*["\']([^"\']+)["\']', webpage, 'thumbnail'),
            'formats': formats,
            'description': description,
        }


class TeacherTubeUserIE(InfoExtractor):
    IE_NAME = 'teachertube:user:collection'
    IE_DESC = 'teachertube.com user and collection videos'

    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?'

    _MEDIA_RE = r'''(?sx)
        class="?sidebar_thumb_time"?>[0-9:]+</div>
        \s*
        <a\s+href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)"
    '''
    _TEST = {
        'url': 'http://www.teachertube.com/user/profile/rbhagwati2',
        'info_dict': {
            'id': 'rbhagwati2'
        },
        'playlist_mincount': 179,
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        user_id = mobj.group('user')

        urls = []
        webpage = self._download_webpage(url, user_id)
        urls.extend(re.findall(self._MEDIA_RE, webpage))

        pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[:-1]
        for p in pages:
            more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p)
            webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages)))
            video_urls = re.findall(self._MEDIA_RE, webpage)
            urls.extend(video_urls)

        entries = [self.url_result(vurl, 'TeacherTube') for vurl in urls]
        return self.playlist_result(entries, user_id)
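
youtube-dl's qualities helper, used above to rank mp3 < flv < mp4, boils down to an index lookup over a preference list; a minimal sketch of that idea (essentially, though not literally, the library's implementation):

def qualities(quality_ids):
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q

quality = qualities(['mp3', 'flv', 'mp4'])
print(quality('mp4'), quality('mp3'), quality('ogg'))  # 2 0 -1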
youtube-dl/youtube_dl/extractor/gdcvault.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_urllib_parse
from ..utils import (
    remove_end,
    HEADRequest,
    sanitized_Request,
)


class GDCVaultIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)?'
    _NETRC_MACHINE = 'gdcvault'
    _TESTS = [
        {
            'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
            'md5': '7ce8388f544c88b7ac11c7ab1b593704',
            'info_dict': {
                'id': '1019721',
                'display_id': 'Doki-Doki-Universe-Sweet-Simple',
                'ext': 'mp4',
                'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
            }
        },
        {
            'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
            'info_dict': {
                'id': '1015683',
                'display_id': 'Embracing-the-Dark-Art-of',
                'ext': 'flv',
                'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
            },
            'params': {
                'skip_download': True,  # Requires rtmpdump
            }
        },
        {
            'url': 'http://www.gdcvault.com/play/1015301/Thexder-Meets-Windows-95-or',
            'md5': 'a5eb77996ef82118afbbe8e48731b98e',
            'info_dict': {
                'id': '1015301',
                'display_id': 'Thexder-Meets-Windows-95-or',
                'ext': 'flv',
                'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment',
            },
            'skip': 'Requires login',
        },
        {
            'url': 'http://gdcvault.com/play/1020791/',
            'only_matching': True,
        }
    ]

    def _parse_mp4(self, xml_description):
        video_formats = []
        mp4_video = xml_description.find('./metadata/mp4video')
        if mp4_video is None:
            return None

        mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
        video_root = mobj.group('root')
        formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
        for format in formats:
            mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
            url = video_root + mobj.group('path')
            vbr = format.find('bitrate').text
            video_formats.append({
                'url': url,
                'vbr': int(vbr),
            })
        return video_formats

    def _parse_flv(self, xml_description):
        formats = []
        akamai_url = xml_description.find('./metadata/akamaiHost').text
        audios = xml_description.find('./metadata/audios')
        if audios is not None:
            for audio in audios:
                formats.append({
                    'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
                    'play_path': remove_end(audio.get('url'), '.flv'),
                    'ext': 'flv',
                    'vcodec': 'none',
                    'format_id': audio.get('code'),
                })
        slide_video_path = xml_description.find('./metadata/slideVideo').text
        formats.append({
            'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
            'play_path': remove_end(slide_video_path, '.flv'),
            'ext': 'flv',
            'format_note': 'slide deck video',
            'quality': -2,
            'preference': -2,
            'format_id': 'slides',
        })
        speaker_video_path = xml_description.find('./metadata/speakerVideo').text
        formats.append({
            'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
            'play_path': remove_end(speaker_video_path, '.flv'),
            'ext': 'flv',
            'format_note': 'speaker video',
            'quality': -1,
            'preference': -1,
            'format_id': 'speaker',
        })
        return formats

    def _login(self, webpage_url, display_id):
        (username, password) = self._get_login_info()
        if username is None or password is None:
            self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
            return None

        mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url)
        login_url = mobj.group('root_url') + 'api/login.php'
        logout_url = mobj.group('root_url') + 'logout'

        login_form = {
            'email': username,
            'password': password,
        }

        request = sanitized_Request(login_url, compat_urllib_parse.urlencode(login_form))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        self._download_webpage(request, display_id, 'Logging in')
        start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page')
        self._download_webpage(logout_url, display_id, 'Logging out')

        return start_page

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        display_id = mobj.group('name') or video_id

        webpage_url = 'http://www.gdcvault.com/play/' + video_id
        start_page = self._download_webpage(webpage_url, display_id)

        direct_url = self._search_regex(
            r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);',
            start_page, 'url', default=None)
        if direct_url:
            title = self._html_search_regex(
                r'<td><strong>Session Name</strong></td>\s*<td>(.*?)</td>',
                start_page, 'title')
            video_url = 'http://www.gdcvault.com' + direct_url
            # resolve the url so that we can detect the correct extension
            head = self._request_webpage(HEADRequest(video_url), video_id)
            video_url = head.geturl()

            return {
                'id': video_id,
                'display_id': display_id,
                'url': video_url,
                'title': title,
            }

        xml_root = self._html_search_regex(
            r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>',
            start_page, 'xml root', default=None)
        if xml_root is None:
            # Probably need to authenticate
            login_res = self._login(webpage_url, display_id)
            if login_res is None:
                self.report_warning('Could not login.')
            else:
                start_page = login_res
                # Grab the url from the authenticated page
                xml_root = self._html_search_regex(
                    r'<iframe src="(.*?)player.html.*?".*?</iframe>',
                    start_page, 'xml root')

        xml_name = self._html_search_regex(
            r'<iframe src=".*?\?xml=(.+?\.xml).*?".*?</iframe>',
            start_page, 'xml filename', default=None)
        if xml_name is None:
            # Fallback to the older format
            xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')

        xml_description_url = xml_root + 'xml/' + xml_name
        xml_description = self._download_xml(xml_description_url, display_id)

        video_title = xml_description.find('./metadata/title').text
        video_formats = self._parse_mp4(xml_description)
        if video_formats is None:
            video_formats = self._parse_flv(xml_description)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': video_title,
            'formats': video_formats,
        }
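
A standalone Python 3 sketch of the login round-trip _login performs above; the endpoint is derived the same way the extractor derives it, the credentials are placeholders, and the stdlib stands in for youtube-dl's request helpers (whose downloader also shares a cookie jar across the three requests):

from urllib.parse import urlencode
from urllib.request import Request, urlopen

login_url = 'http://www.gdcvault.com/api/login.php'
form = urlencode({'email': 'user@example.com', 'password': 'hunter2'}).encode('ascii')

req = Request(login_url, data=form)
req.add_header('Content-Type', 'application/x-www-form-urlencoded')
# urlopen(req) would submit the form; the extractor then re-fetches the
# authenticated video page and finally hits /logout.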
youtube-dl/youtube_dl/extractor/bild.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    unescapeHTML,
)


class BildIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
    IE_DESC = 'Bild.de'
    _TEST = {
        'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
        'md5': 'dd495cbd99f2413502a1713a1156ac8a',
        'info_dict': {
            'id': '38184146',
            'ext': 'mp4',
            'title': 'Das können die neuen iPads',
            'description': 'md5:a4058c4fa2a804ab59c00d7244bbf62f',
            'thumbnail': 're:^https?://.*\.jpg$',
            'duration': 196,
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)

        video_data = self._download_json(
            url.split('.bild.html')[0] + ',view=json.bild.html', video_id)

        return {
            'id': video_id,
            'title': unescapeHTML(video_data['title']).strip(),
            'description': unescapeHTML(video_data.get('description')),
            'url': video_data['clipList'][0]['srces'][0]['src'],
            'thumbnail': video_data.get('poster'),
            'duration': int_or_none(video_data.get('durationSec')),
        }

youtube-dl/youtube_dl/extractor/xbef.py

from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote


class XBefIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking',
        'md5': 'a478b565baff61634a98f5e5338be995',
        'info_dict': {
            'id': '5119',
            'ext': 'mp4',
            'title': 'md5:7358a9faef8b7b57acda7c04816f170e',
            'age_limit': 18,
            'thumbnail': 're:^http://.*\.jpg',
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<h1[^>]*>(.*?)</h1>', webpage, 'title')

        config_url_enc = self._download_webpage(
            'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
            note='Retrieving config URL')
        config_url = compat_urllib_parse_unquote(config_url_enc)
        config = self._download_xml(
            config_url, video_id, note='Retrieving config')

        video_url = config.find('./file').text
        thumbnail = config.find('./image').text

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'age_limit': 18,
        }
# ===== youtube-dl/youtube_dl/extractor/indavideo.py =====

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    parse_age_limit,
    parse_iso8601,
)


class IndavideoEmbedIE(InfoExtractor):
    _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)'
    _TESTS = [{
        'url': 'http://indavideo.hu/player/video/1bdc3c6d80/',
        'md5': 'f79b009c66194acacd40712a6778acfa',
        'info_dict': {
            'id': '1837039',
            'ext': 'mp4',
            'title': 'Cicatánc',
            'description': '',
            'thumbnail': 're:^https?://.*\.jpg$',
            'uploader': 'cukiajanlo',
            'uploader_id': '83729',
            'timestamp': 1439193826,
            'upload_date': '20150810',
            'duration': 72,
            'age_limit': 0,
            'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'],
        },
    }, {
        'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1',
        'only_matching': True,
    }, {
        'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        video = self._download_json(
            'http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id,
            video_id)['data']

        title = video['title']

        video_urls = video.get('video_files', [])
        video_file = video.get('video_file')
        if video_file:  # bug fix: was `if video:`, which is always true here and appended None
            video_urls.append(video_file)
        video_urls = list(set(video_urls))

        video_prefix = video_urls[0].rsplit('/', 1)[0]

        for flv_file in video.get('flv_files', []):
            flv_url = '%s/%s' % (video_prefix, flv_file)
            if flv_url not in video_urls:
                video_urls.append(flv_url)

        formats = [{
            'url': video_url,
            # int_or_none added: the regex yields a string, but height must be numeric
            'height': int_or_none(self._search_regex(
                r'\.(\d{3,4})\.mp4$', video_url, 'height', default=None)),
        } for video_url in video_urls]
        self._sort_formats(formats)

        timestamp = video.get('date')
        if timestamp:
            # upload date is in CEST
            timestamp = parse_iso8601(timestamp + ' +0200', ' ')

        thumbnails = [{
            'url': self._proto_relative_url(thumbnail)
        } for thumbnail in video.get('thumbnails', [])]

        tags = [tag['title'] for tag in video.get('tags', [])]

        return {
            'id': video.get('id') or video_id,
            'title': title,
            'description': video.get('description'),
            'thumbnails': thumbnails,
            'uploader': video.get('user_name'),
            'uploader_id': video.get('user_id'),
            'timestamp': timestamp,
            'duration': int_or_none(video.get('length')),
            'age_limit': parse_age_limit(video.get('age_limit')),
            'tags': tags,
            'formats': formats,
        }


class IndavideoIE(InfoExtractor):
    _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P<id>[^/#?]+)'
    _TESTS = [{
        'url': 'http://indavideo.hu/video/Vicces_cica_1',
        'md5': '8c82244ba85d2a2310275b318eb51eac',
        'info_dict': {
            'id': '1335611',
            'display_id': 'Vicces_cica_1',
            'ext': 'mp4',
            'title': 'Vicces cica',
            'description': 'Játszik a tablettel. :D',
            'thumbnail': 're:^https?://.*\.jpg$',
            'uploader': 'Jet_Pack',
            'uploader_id': '491217',
            'timestamp': 1390821212,
            'upload_date': '20140127',
            'duration': 7,
            'age_limit': 0,
            'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'],
        },
    }, {
        'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz',
        'only_matching': True,
    }, {
        'url': 'http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko',
        'only_matching': True,
    }, {
        'url': 'http://erotika.indavideo.hu/video/Amator_tini_punci',
        'only_matching': True,
    }, {
        'url': 'http://film.indavideo.hu/video/f_hrom_nagymamm_volt',
        'only_matching': True,
    }, {
        'url': 'http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)
        embed_url = self._search_regex(
            r'<link[^>]+rel="video_src"[^>]+href="(.+?)"', webpage, 'embed url')

        return {
            '_type': 'url_transparent',
            'ie_key': 'IndavideoEmbed',
            'url': embed_url,
            'display_id': display_id,
        }
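# --- Editor's note: small standalone sketch (sample URLs invented) of the
# height heuristic used in IndavideoEmbedIE above: Indavideo file names end
# in '.<height>.mp4', so the height can be recovered from the URL alone.
import re

def height_from_url(video_url):
    m = re.search(r'\.(\d{3,4})\.mp4$', video_url)
    return int(m.group(1)) if m else None

assert height_from_url('http://example.com/video.720.mp4') == 720
assert height_from_url('http://example.com/video.flv') is None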
# ===== youtube-dl/youtube_dl/extractor/dvtv.py =====

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    js_to_json,
    unescapeHTML,
    ExtractorError,
)


class DVTVIE(InfoExtractor):
    IE_NAME = 'dvtv'
    IE_DESC = 'http://video.aktualne.cz/'
    _VALID_URL = r'http://video\.aktualne\.cz/(?:[^/]+/)+r~(?P<id>[0-9a-f]{32})'
    _TESTS = [{
        'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/',
        'md5': '67cb83e4a955d36e1b5d31993134a0c2',
        'info_dict': {
            'id': 'dc0768de855511e49e4b0025900fea04',
            'ext': 'mp4',
            'title': 'Vondra o Českém století: Při pohledu na Havla mi bylo trapně',
        }
    }, {
        'url': 'http://video.aktualne.cz/dvtv/stropnicky-policie-vrbetice-preventivne-nekontrolovala/r~82ed4322849211e4a10c0025900fea04/',
        'md5': '6388f1941b48537dbd28791f712af8bf',
        'info_dict': {
            'id': '72c02230849211e49f60002590604f2e',
            'ext': 'mp4',
            'title': 'Stropnický: Policie Vrbětice preventivně nekontrolovala',
        }
    }, {
        'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/',
        'info_dict': {
            'title': 'DVTV 16. 12. 2014: útok Talibanu, boj o kliniku, uprchlíci',
            'id': '973eb3bc854e11e498be002590604f2e',
        },
        'playlist': [{
            'md5': 'da7ca6be4935532241fa9520b3ad91e4',
            'info_dict': {
                'id': 'b0b40906854d11e4bdad0025900fea04',
                'ext': 'mp4',
                'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne'
            }
        }, {
            'md5': '5f7652a08b05009c1292317b449ffea2',
            'info_dict': {
                'id': '420ad9ec854a11e4bdad0025900fea04',
                'ext': 'mp4',
                'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka'
            }
        }, {
            'md5': '498eb9dfa97169f409126c617e2a3d64',
            'info_dict': {
                'id': '95d35580846a11e4b6d20025900fea04',
                'ext': 'mp4',
                'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?'
            }
        }, {
            'md5': 'b8dc6b744844032dab6ba3781a7274b9',
            'info_dict': {
                'id': '6fe14d66853511e4833a0025900fea04',
                'ext': 'mp4',
                'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády'
            }
        }],
    }, {
        'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/',
        'only_matching': True,
    }]

    def _parse_video_metadata(self, js, video_id):
        metadata = self._parse_json(js, video_id, transform_source=js_to_json)

        formats = []
        for video in metadata['sources']:
            ext = video['type'][6:]
            formats.append({
                'url': video['file'],
                'ext': ext,
                'format_id': '%s-%s' % (ext, video['label']),
                'height': int(video['label'].rstrip('p')),
                'fps': 25,
            })
        self._sort_formats(formats)

        return {
            'id': metadata['mediaid'],
            'title': unescapeHTML(metadata['title']),
            'thumbnail': self._proto_relative_url(metadata['image'], 'http:'),
            'formats': formats
        }

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # single video
        item = self._search_regex(
            r"(?s)embedData[0-9a-f]{32}\['asset'\]\s*=\s*(\{.+?\});",
            webpage, 'video', default=None, fatal=False)

        if item:
            return self._parse_video_metadata(item, video_id)

        # playlist
        items = re.findall(
            r"(?s)BBX\.context\.assets\['[0-9a-f]{32}'\]\.push\(({.+?})\);",
            webpage)

        if items:
            return {
                '_type': 'playlist',
                'id': video_id,
                'title': self._og_search_title(webpage),
                'entries': [self._parse_video_metadata(i, video_id) for i in items]
            }

        raise ExtractorError('Could not find either video or playlist')  # grammar fix: was "neither ... nor"
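# --- Editor's note: hedged sketch (the sample metadata is invented) of how
# _parse_video_metadata above turns one JWPlayer-style 'sources' entry into a
# format dict: the MIME type 'video/mp4' yields ext 'mp4' ([6:] strips the
# 'video/' prefix), the quality label '720p' yields the integer height, and
# the extractor hardcodes 25 fps for this site's material.
def source_to_format(video):
    ext = video['type'][6:]
    return {
        'url': video['file'],
        'ext': ext,
        'format_id': '%s-%s' % (ext, video['label']),
        'height': int(video['label'].rstrip('p')),
        'fps': 25,
    }

print(source_to_format(
    {'type': 'video/mp4', 'file': 'http://example.com/v.mp4', 'label': '720p'}))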
# ===== youtube-dl/youtube_dl/extractor/footyroom.py =====

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor


class FootyRoomIE(InfoExtractor):
    _VALID_URL = r'http://footyroom\.com/(?P<id>[^/]+)'
    _TESTS = [{
        'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/',
        'info_dict': {
            'id': 'schalke-04-0-2-real-madrid-2015-02',
            'title': 'Schalke 04 0 – 2 Real Madrid',
        },
        'playlist_count': 3,
        'skip': 'Video for this match is not available',
    }, {
        'url': 'http://footyroom.com/georgia-0-2-germany-2015-03/',
        'info_dict': {
            'id': 'georgia-0-2-germany-2015-03',
            'title': 'Georgia 0 – 2 Germany',
        },
        'playlist_count': 1,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        playlist = self._parse_json(
            self._search_regex(
                r'VideoSelector\.load\((\[.+?\])\);', webpage, 'video selector'),
            playlist_id)

        playlist_title = self._og_search_title(webpage)

        entries = []
        for video in playlist:
            payload = video.get('payload')
            if not payload:
                continue
            playwire_url = self._search_regex(
                r'data-config="([^"]+)"', payload, 'playwire url', default=None)
            if playwire_url:
                entries.append(self.url_result(self._proto_relative_url(
                    playwire_url, 'http:'), 'Playwire'))

        return self.playlist_result(entries, playlist_id, playlist_title)

# ===== youtube-dl/youtube_dl/extractor/googlesearch.py =====

from __future__ import unicode_literals

import itertools
import re

from .common import SearchInfoExtractor
from ..compat import (
    compat_urllib_parse,
)


class GoogleSearchIE(SearchInfoExtractor):
    IE_DESC = 'Google Video search'
    _MAX_RESULTS = 1000
    IE_NAME = 'video.google:search'
    _SEARCH_KEY = 'gvsearch'
    _TEST = {
        'url': 'gvsearch15:python language',
        'info_dict': {
            'id': 'python language',
            'title': 'python language',
        },
        'playlist_count': 15,
    }

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        entries = []
        res = {
            '_type': 'playlist',
            'id': query,
            'title': query,
        }

        for pagenum in itertools.count():
            result_url = (
                'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
                % (compat_urllib_parse.quote_plus(query), pagenum * 10))

            webpage = self._download_webpage(
                result_url, 'gvsearch:' + query,
                note='Downloading result page ' + str(pagenum + 1))

            for hit_idx, mobj in enumerate(re.finditer(
                    r'<h3 class="r"><a href="([^"]+)"', webpage)):
                # Skip playlists
                if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
                    continue

                entries.append({
                    '_type': 'url',
                    'url': mobj.group(1)
                })

            if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
                res['entries'] = entries[:n]
                return res
# ===== youtube-dl/youtube_dl/extractor/byutv.py =====

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor
from ..utils import ExtractorError


class BYUtvIE(InfoExtractor):
    _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
    _TEST = {
        'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
        'info_dict': {
            'id': 'studio-c-season-5-episode-5',
            'ext': 'mp4',
            'description': 'md5:e07269172baff037f8e8bf9956bc9747',
            'title': 'Season 5 Episode 5',
            'thumbnail': 're:^https?://.*\.jpg$',
            'duration': 1486.486,
        },
        'params': {'skip_download': True},
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)
        episode_code = self._search_regex(
            r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information')
        episode_json = re.sub(
            r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', episode_code)
        ep = json.loads(episode_json)

        if ep['providerType'] == 'Ooyala':
            return {
                '_type': 'url_transparent',
                'ie_key': 'Ooyala',
                'url': 'ooyala:%s' % ep['providerId'],
                'id': video_id,
                'title': ep['title'],
                'description': ep.get('description'),
                'thumbnail': ep.get('imageThumbnail'),
            }
        else:
            # bug fix: was ep['provider'], a key that is never set (see providerType above)
            raise ExtractorError('Unsupported provider %s' % ep['providerType'])

# ===== youtube-dl/youtube_dl/extractor/echomsk.py =====

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor


class EchoMskIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?echo\.msk\.ru/sounds/(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.echo.msk.ru/sounds/1464134.html',
        'md5': '2e44b3b78daff5b458e4dbc37f191f7c',
        'info_dict': {
            'id': '1464134',
            'ext': 'mp3',
            'title': 'Особое мнение - 29 декабря 2014, 19:08',
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        audio_url = self._search_regex(
            r'<a rel="mp3" href="([^"]+)">', webpage, 'audio URL')

        title = self._html_search_regex(
            r'<a href="/programs/[^"]+" target="_blank">([^<]+)</a>',
            webpage, 'title')

        air_date = self._html_search_regex(
            r'(?s)<div class="date">(.+?)</div>',
            webpage, 'date', fatal=False, default=None)

        if air_date:
            air_date = re.sub(r'(\s)\1+', r'\1', air_date)
            if air_date:
                title = '%s - %s' % (title, air_date)

        return {
            'id': video_id,
            'url': audio_url,
            'title': title,
        }
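# --- Editor's note: minimal demonstration (the sample snippet is invented) of
# the quote-normalising substitution used in BYUtvIE above: single-quoted
# `key: 'value'` pairs from the page's JS literal are rewritten as JSON
# `"key": "value"` pairs before json.loads().
import json
import re

episode_code = """{
    title: 'Season 5 Episode 5',
    providerType: 'Ooyala'
}"""
episode_json = re.sub(
    r"(\n\s+)([a-zA-Z]+):\s+'(.*?)'", r'\1"\2": "\3"', episode_code)
print(json.loads(episode_json)['title'])  # Season 5 Episode 5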
# ===== youtube-dl/youtube_dl/extractor/nfl.py =====

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlparse,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    remove_end,
)


class NFLIE(InfoExtractor):
    IE_NAME = 'nfl.com'
    _VALID_URL = r'''(?x)
        https?://
            (?P<host>
                (?:www\.)?
                (?:
                    (?:
                        nfl|buffalobills|miamidolphins|patriots|newyorkjets|
                        baltimoreravens|bengals|clevelandbrowns|steelers|
                        houstontexans|colts|jaguars|titansonline|denverbroncos|
                        kcchiefs|raiders|chargers|dallascowboys|giants|
                        philadelphiaeagles|redskins|chicagobears|detroitlions|
                        packers|vikings|atlantafalcons|panthers|neworleanssaints|
                        buccaneers|azcardinals|stlouisrams|49ers|seahawks
                    )\.com|
                    .+?\.clubs\.nfl\.com
                )
            )/
            (?:.+?/)*
            (?P<id>[^/#?&]+)
    '''
    _TESTS = [{
        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
        'md5': '394ef771ddcd1354f665b471d78ec4c6',
        'info_dict': {
            'id': '0ap3000000398478',
            'ext': 'mp4',
            'title': 'Week 3: Redskins vs. Eagles highlights',
            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
            'upload_date': '20140921',
            'timestamp': 1411337580,
            'thumbnail': 're:^https?://.*\.jpg$',
        }
    }, {
        'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
        'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
        'info_dict': {
            'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
            'ext': 'mp4',
            'title': 'LIVE: Post Game vs. Browns',
            'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
            'upload_date': '20131229',
            'timestamp': 1388354455,
            'thumbnail': 're:^https?://.*\.jpg$',
        }
    }, {
        'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish',
        'info_dict': {
            'id': '0ap3000000467607',
            'ext': 'mp4',
            'title': 'Frustrations flare on the field',
            'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.',
            'timestamp': 1422850320,
            'upload_date': '20150202',
        },
    }, {
        'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette',
        'md5': '4c319e2f625ffd0b481b4382c6fc124c',
        'info_dict': {
            'id': 'n-238346',
            'ext': 'mp4',
            'title': '10 Days at Gillette',
            'description': 'md5:8cd9cd48fac16de596eadc0b24add951',
            'timestamp': 1442618809,
            'upload_date': '20150918',
        },
    }, {
        # lowercase data-contentid
        'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7',
        'info_dict': {
            'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2',
            'ext': 'mp4',
            'title': 'Tomlin looks ahead to Ravens on a short week',
            'description': 'md5:32f3f7b139f43913181d5cbb24ecad75',
            'timestamp': 1443459651,
            'upload_date': '20150928',
        },
        'params': {'skip_download': True},
    }, {
        'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
        'only_matching': True,
    }, {
        'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a',
        'only_matching': True,
    }]

    @staticmethod
    def prepend_host(host, url):
        if not url.startswith('http'):
            if not url.startswith('/'):
                url = '/%s' % url
            url = 'http://{0:}{1:}'.format(host, url)
        return url

    @staticmethod
    def format_from_stream(stream, protocol, host, path_prefix='',
                           preference=0, note=None):
        url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
            protocol=protocol,
            host=host,
            prefix=path_prefix,
            path=stream.get('path'),
        )
        return {
            'url': url,
            'vbr': int_or_none(stream.get('rate', 0), 1000),
            'preference': preference,
            'format_note': note,
        }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id, host = mobj.group('id'), mobj.group('host')

        webpage = self._download_webpage(url, video_id)

        config_url = NFLIE.prepend_host(host, self._search_regex(
            r'(?:(?:config|configURL)\s*:\s*|<nflcs:avplayer[^>]+data-config\s*=\s*)(["\'])(?P<config>.+?)\1',
            webpage, 'config URL', default='static/content/static/config/video/config.json',
            group='config'))
        # For articles, the id in the url is not the video id
        video_id = self._search_regex(
            r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P<id>.+?)\1',
            webpage, 'video id', default=video_id, group='id')
        config = self._download_json(config_url, video_id, 'Downloading player config')
        url_template = NFLIE.prepend_host(
            host, '{contentURLTemplate:}'.format(**config))
        video_data = self._download_json(
            url_template.format(id=video_id), video_id)

        formats = []
        cdn_data = video_data.get('cdnData', {})
        streams = cdn_data.get('bitrateInfo', [])
        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
            parts = compat_urllib_parse_urlparse(cdn_data.get('uri'))
            protocol, host = parts.scheme, parts.netloc
            for stream in streams:
                formats.append(
                    NFLIE.format_from_stream(stream, protocol, host))
        else:
            cdns = config.get('cdns')
            if not cdns:
                raise ExtractorError('Failed to get CDN data', expected=True)
            for name, cdn in cdns.items():
                # LimeLight streams don't seem to work
                if cdn.get('name') == 'LIMELIGHT':
                    continue
                protocol = cdn.get('protocol')
                host = remove_end(cdn.get('host', ''), '/')
                if not (protocol and host):
                    continue
                prefix = cdn.get('pathprefix', '')
                if prefix and not prefix.endswith('/'):
                    prefix = '%s/' % prefix
                preference = 0
                if protocol == 'rtmp':
                    preference = -2
                elif 'prog' in name.lower():
                    preference = 1
                for stream in streams:
                    formats.append(
                        NFLIE.format_from_stream(stream, protocol, host,
                                                 prefix, preference, name))
        self._sort_formats(formats)

        thumbnail = None
        for q in ('xl', 'l', 'm', 's', 'xs'):
            thumbnail = video_data.get('imagePaths', {}).get(q)
            if thumbnail:
                break

        return {
            'id': video_id,
            'title': video_data.get('headline'),
            'formats': formats,
            'description': video_data.get('caption'),
            'duration': video_data.get('duration'),
            'thumbnail': thumbnail,
            'timestamp': int_or_none(video_data.get('posted'), 1000),
        }
# ===== youtube-dl/youtube_dl/extractor/atresplayer.py =====

from __future__ import unicode_literals

import time
import hmac
import hashlib
import re

from .common import InfoExtractor
from ..compat import (
    compat_str,
    compat_urllib_parse,
)
from ..utils import (
    int_or_none,
    float_or_none,
    sanitized_Request,
    xpath_text,
    ExtractorError,
)


class AtresPlayerIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html'
    _NETRC_MACHINE = 'atresplayer'
    _TESTS = [
        {
            'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html',
            'md5': 'efd56753cda1bb64df52a3074f62e38a',
            'info_dict': {
                'id': 'capitulo-10-especial-solidario-nochebuena',
                'ext': 'mp4',
                'title': 'Especial Solidario de Nochebuena',
                'description': 'md5:e2d52ff12214fa937107d21064075bf1',
                'duration': 5527.6,
                'thumbnail': 're:^https?://.*\.jpg$',
            },
            'skip': 'This video is only available for registered users'
        },
        {
            'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html',
            'md5': '0d0e918533bbd4b263f2de4d197d4aac',
            'info_dict': {
                'id': 'capitulo-112-david-bustamante',
                'ext': 'flv',
                'title': 'David Bustamante',
                'description': 'md5:f33f1c0a05be57f6708d4dd83a3b81c6',
                'duration': 1439.0,
                'thumbnail': 're:^https?://.*\.jpg$',
            },
        },
        {
            'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html',
            'only_matching': True,
        },
    ]

    _USER_AGENT = 'Dalvik/1.6.0 (Linux; U; Android 4.3; GT-I9300 Build/JSS15J)'  # typo fix: closing parenthesis was missing
    _MAGIC = 'QWtMLXs414Yo+c#_+Q#K@NN)'
    _TIMESTAMP_SHIFT = 30000

    _TIME_API_URL = 'http://servicios.atresplayer.com/api/admin/time.json'
    _URL_VIDEO_TEMPLATE = 'https://servicios.atresplayer.com/api/urlVideo/{1}/{0}/{1}|{2}|{3}.json'
    _PLAYER_URL_TEMPLATE = 'https://servicios.atresplayer.com/episode/getplayer.json?episodePk=%s'
    _EPISODE_URL_TEMPLATE = 'http://www.atresplayer.com/episodexml/%s'

    _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check'

    _ERRORS = {
        'UNPUBLISHED': 'We\'re sorry, but this video is not yet available.',
        'DELETED': 'This video has expired and is no longer available for online streaming.',
        'GEOUNPUBLISHED': 'We\'re sorry, but this video is not available in your region due to right restrictions.',
        # 'PREMIUM': 'PREMIUM',
    }

    def _real_initialize(self):
        self._login()

    def _login(self):
        (username, password) = self._get_login_info()
        if username is None:
            return

        login_form = {
            'j_username': username,
            'j_password': password,
        }

        request = sanitized_Request(
            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        response = self._download_webpage(
            request, None, 'Logging in as %s' % username)

        error = self._html_search_regex(
            r'(?s)<ul class="list_error">(.+?)</ul>', response, 'error', default=None)
        if error:
            raise ExtractorError(
                'Unable to login: %s' % error, expected=True)

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        episode_id = self._search_regex(
            r'episode="([^"]+)"', webpage, 'episode id')

        request = sanitized_Request(
            self._PLAYER_URL_TEMPLATE % episode_id,
            headers={'User-Agent': self._USER_AGENT})
        player = self._download_json(request, episode_id, 'Downloading player JSON')

        episode_type = player.get('typeOfEpisode')
        error_message = self._ERRORS.get(episode_type)
        if error_message:
            raise ExtractorError(
                '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)

        formats = []
        video_url = player.get('urlVideo')
        if video_url:
            format_info = {
                'url': video_url,
                'format_id': 'http',
            }
            mobj = re.search(r'(?P<bitrate>\d+)K_(?P<width>\d+)x(?P<height>\d+)', video_url)
            if mobj:
                format_info.update({
                    'width': int_or_none(mobj.group('width')),
                    'height': int_or_none(mobj.group('height')),
                    'tbr': int_or_none(mobj.group('bitrate')),
                })
            formats.append(format_info)

        timestamp = int_or_none(self._download_webpage(
            self._TIME_API_URL,
            video_id, 'Downloading timestamp', fatal=False), 1000, time.time())
        timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT)
        token = hmac.new(
            self._MAGIC.encode('ascii'),
            (episode_id + timestamp_shifted).encode('utf-8'), hashlib.md5
        ).hexdigest()

        request = sanitized_Request(
            self._URL_VIDEO_TEMPLATE.format('windows', episode_id, timestamp_shifted, token),
            headers={'User-Agent': self._USER_AGENT})

        fmt_json = self._download_json(
            request, video_id, 'Downloading windows video JSON')

        result = fmt_json.get('resultDes')
        if result.lower() != 'ok':
            raise ExtractorError(
                '%s returned error: %s' % (self.IE_NAME, result), expected=True)

        for format_id, video_url in fmt_json['resultObject'].items():
            if format_id == 'token' or not video_url.startswith('http'):
                continue
            if 'geodeswowsmpra3player' in video_url:
                f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0]
                f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path)
                # these videos are DRM-protected; the f4m downloader doesn't support them
                continue
            else:
                f4m_url = video_url[:-9] + '/manifest.f4m'
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        self._sort_formats(formats)

        path_data = player.get('pathData')

        episode = self._download_xml(
            self._EPISODE_URL_TEMPLATE % path_data, video_id,
            'Downloading episode XML')

        duration = float_or_none(xpath_text(
            episode, './media/asset/info/technical/contentDuration', 'duration'))

        art = episode.find('./media/asset/info/art')
        title = xpath_text(art, './name', 'title')
        description = xpath_text(art, './description', 'description')
        thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail')

        subtitles = {}
        subtitle_url = xpath_text(episode, './media/asset/files/subtitle', 'subtitle')
        if subtitle_url:
            subtitles['es'] = [{
                'ext': 'srt',
                'url': subtitle_url,
            }]

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }
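# --- Editor's note: the token computation from AtresPlayerIE above, shown
# standalone (the episode id and timestamp values are invented; the magic key
# is the extractor's own _MAGIC constant): an MD5-HMAC over episode id plus
# the server timestamp shifted by _TIMESTAMP_SHIFT.
import hashlib
import hmac

magic = 'QWtMLXs414Yo+c#_+Q#K@NN)'
episode_id = 'capitulo-10-especial-solidario-nochebuena'
timestamp_shifted = '1451606400000'  # server time in ms + 30000

token = hmac.new(
    magic.encode('ascii'),
    (episode_id + timestamp_shifted).encode('utf-8'),
    hashlib.md5).hexdigest()
print(token)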
# ===== youtube-dl/youtube_dl/extractor/hypem.py =====

from __future__ import unicode_literals

import json
import time

from .common import InfoExtractor
from ..compat import compat_urllib_parse
from ..utils import (
    ExtractorError,
    sanitized_Request,
)


class HypemIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/'
    _TEST = {
        'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME',
        'md5': 'b9cc91b5af8995e9f0c1cee04c575828',
        'info_dict': {
            'id': '1v6ga',
            'ext': 'mp3',
            'title': 'Tame',
            'uploader': 'BODYWORK',
        }
    }

    def _real_extract(self, url):
        track_id = self._match_id(url)

        data = {'ax': 1, 'ts': time.time()}
        request = sanitized_Request(url + '?' + compat_urllib_parse.urlencode(data))
        response, urlh = self._download_webpage_handle(
            request, track_id, 'Downloading webpage with the url')

        html_tracks = self._html_search_regex(
            r'(?ms)<script type="application/json" id="displayList-data">(.+?)</script>',
            response, 'tracks')
        try:
            track_list = json.loads(html_tracks)
            track = track_list['tracks'][0]
        except ValueError:
            raise ExtractorError('Hypemachine contained invalid JSON.')

        key = track['key']
        track_id = track['id']
        title = track['song']

        request = sanitized_Request(
            'http://hypem.com/serve/source/%s/%s' % (track_id, key),
            '', {'Content-Type': 'application/json'})
        song_data = self._download_json(request, track_id, 'Downloading metadata')
        final_url = song_data['url']
        artist = track.get('artist')

        return {
            'id': track_id,
            'url': final_url,
            'ext': 'mp3',
            'title': title,
            'uploader': artist,
        }
# ===== youtube-dl/youtube_dl/extractor/ivideon.py =====

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
    compat_urlparse,
)
from ..utils import qualities


class IvideonIE(InfoExtractor):
    IE_NAME = 'ivideon'
    IE_DESC = 'Ivideon TV'
    _VALID_URL = r'https?://(?:www\.)?ivideon\.com/tv/(?:[^/]+/)*camera/(?P<id>\d+-[\da-f]+)/(?P<camera_id>\d+)'
    _TESTS = [{
        'url': 'https://www.ivideon.com/tv/camera/100-916ca13b5c4ad9f564266424a026386d/0/',
        'info_dict': {
            'id': '100-916ca13b5c4ad9f564266424a026386d',
            'ext': 'flv',
            'title': 're:^Касса [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': 'Основное предназначение - запись действий кассиров. Плюс общий вид.',
            'is_live': True,
        },
        'params': {'skip_download': True},
    }, {
        'url': 'https://www.ivideon.com/tv/camera/100-c4ee4cb9ede885cf62dfbe93d7b53783/589824/?lang=ru',
        'only_matching': True,
    }, {
        'url': 'https://www.ivideon.com/tv/map/22.917923/-31.816406/16/camera/100-e7bc16c7d4b5bbd633fd5350b66dfa9a/0',
        'only_matching': True,
    }]

    _QUALITIES = ('low', 'mid', 'hi')

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        server_id, camera_id = mobj.group('id'), mobj.group('camera_id')
        camera_name, description = None, None
        camera_url = compat_urlparse.urljoin(
            url, '/tv/camera/%s/%s/' % (server_id, camera_id))

        webpage = self._download_webpage(camera_url, server_id, fatal=False)
        if webpage:
            config_string = self._search_regex(
                r'var\s+config\s*=\s*({.+?});', webpage, 'config', default=None)
            if config_string:
                config = self._parse_json(config_string, server_id, fatal=False)
                camera_info = config.get('ivTvAppOptions', {}).get('currentCameraInfo')
                if camera_info:
                    camera_name = camera_info.get('camera_name')
                    description = camera_info.get('misc', {}).get('description')
            if not camera_name:
                camera_name = self._html_search_meta(
                    'name', webpage, 'camera name', default=None) or self._search_regex(
                    r'<h1[^>]+class="b-video-title"[^>]*>([^<]+)',
                    webpage, 'camera name', default=None)

        quality = qualities(self._QUALITIES)

        formats = [{
            'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse.urlencode({
                'server': server_id,
                'camera': camera_id,
                'sessionId': 'demo',
                'q': quality(format_id),
            }),
            'format_id': format_id,
            'ext': 'flv',
            'quality': quality(format_id),
        } for format_id in self._QUALITIES]
        self._sort_formats(formats)

        return {
            'id': server_id,
            'title': self._live_title(camera_name or server_id),
            'description': description,
            'is_live': True,
            'formats': formats,
        }
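# --- Editor's note: hedged sketch of the qualities() helper pattern used in
# IvideonIE above, reimplemented here so the example is self-contained (the
# real helper lives in youtube_dl/utils.py): given quality names ordered from
# worst to best, it returns a ranking function usable as a 'quality' key.
def qualities(quality_ids):
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q

quality = qualities(('low', 'mid', 'hi'))
assert quality('hi') > quality('low')  # higher index = preferred
assert quality('unknown') == -1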
# ===== youtube-dl/youtube_dl/extractor/teamcoco.py =====

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import base64
import binascii
import re
import json

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    qualities,
    determine_ext,
)
from ..compat import compat_ord


class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)'
    _TESTS = [
        {
            'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant',
            'md5': '3f7746aa0dc86de18df7539903d399ea',
            'info_dict': {
                'id': '80187',
                'ext': 'mp4',
                'title': 'Conan Becomes A Mary Kay Beauty Consultant',
                'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.',
                'duration': 504,
                'age_limit': 0,
            }
        }, {
            'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
            'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',
            'info_dict': {
                'id': '19705',
                'ext': 'mp4',
                'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.',
                'title': 'Louis C.K. Interview Pt. 1 11/3/11',
                'duration': 288,
                'age_limit': 0,
            }
        }, {
            'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey',
            'info_dict': {
                'id': '88748',
                'ext': 'mp4',
                'title': 'Timothy Olyphant Raises A Toast To “Justified”',
                'description': 'md5:15501f23f020e793aeca761205e42c24',
            },
            'params': {'skip_download': True},  # m3u8 downloads
        }, {
            'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
            'info_dict': {
                'id': '89341',
                'ext': 'mp4',
                'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
                'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
            },
            'params': {'skip_download': True},  # m3u8 downloads
        }
    ]
    _VIDEO_ID_REGEXES = (
        r'"eVar42"\s*:\s*(\d+)',
        r'Ginger\.TeamCoco\.openInApp\("video",\s*"([^"]+)"',
        r'"id_not"\s*:\s*(\d+)'
    )

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        display_id = mobj.group('display_id')
        webpage, urlh = self._download_webpage_handle(url, display_id)
        if 'src=expired' in urlh.geturl():
            raise ExtractorError('This video is expired.', expected=True)

        video_id = mobj.group('video_id')
        if not video_id:
            video_id = self._html_search_regex(
                self._VIDEO_ID_REGEXES, webpage, 'video id')

        data = None

        preload_codes = self._html_search_regex(
            r'(function.+)setTimeout\(function\(\)\{playlist',
            webpage, 'preload codes')
        # character-class fix: was [a-zA-z...], which also matched the
        # punctuation characters between 'Z' and 'a' in ASCII
        base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes)
        base64_fragments.remove('init')

        def _check_sequence(cur_fragments):
            if not cur_fragments:
                return
            for i in range(len(cur_fragments)):
                cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii')
                try:
                    raw_data = base64.b64decode(cur_sequence)
                    if compat_ord(raw_data[0]) == compat_ord('{'):
                        return json.loads(raw_data.decode('utf-8'))
                except (TypeError, binascii.Error, UnicodeDecodeError, ValueError):
                    continue

        def _check_data():
            for i in range(len(base64_fragments) + 1):
                for j in range(i, len(base64_fragments) + 1):
                    data = _check_sequence(base64_fragments[:i] + base64_fragments[j:])
                    if data:
                        return data

        self.to_screen('Trying to compute the data sequence. This may take some time.')
        data = _check_data()

        if not data:
            raise ExtractorError(
                'Preload information could not be extracted', expected=True)

        formats = []
        get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
        for filed in data['files']:
            if determine_ext(filed['url']) == 'm3u8':
                # compat_urllib_parse.urljoin does not work here
                if filed['url'].startswith('/'):
                    m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url']
                else:
                    m3u8_url = filed['url']
                m3u8_formats = self._extract_m3u8_formats(
                    m3u8_url, video_id, ext='mp4')
                for m3u8_format in m3u8_formats:
                    if m3u8_format not in formats:
                        formats.append(m3u8_format)
            elif determine_ext(filed['url']) == 'f4m':
                # TODO Correct f4m extraction
                continue
            else:
                if filed['url'].startswith('/mp4:protected/'):
                    # TODO Correct extraction for these files
                    continue
                m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])
                if m_format is not None:
                    format_id = m_format.group(1)
                else:
                    format_id = filed['bitrate']
                tbr = (
                    int(filed['bitrate'])
                    if filed['bitrate'].isdigit()
                    else None)

                formats.append({
                    'url': filed['url'],
                    'ext': 'mp4',
                    'tbr': tbr,
                    'format_id': format_id,
                    'quality': get_quality(format_id),
                })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'formats': formats,
            'title': data['title'],
            'thumbnail': data.get('thumb', {}).get('href'),
            'description': data.get('teaser'),
            'duration': data.get('duration'),
            'age_limit': self._family_friendly_search(webpage),
        }
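# --- Editor's note: standalone sketch (toy data) of the reassembly trick in
# TeamcocoIE._check_sequence above: the page hides JSON as base64 fragments in
# an unknown rotation, so each rotation of the fragment list is concatenated
# and base64-decoded until one decodes to text starting with '{'.
import base64
import binascii
import json

def check_sequence(fragments):
    for i in range(len(fragments)):
        candidate = ''.join(fragments[i:] + fragments[:i]).encode('ascii')
        try:
            raw = base64.b64decode(candidate)
            if raw[:1] == b'{':
                return json.loads(raw.decode('utf-8'))
        except (binascii.Error, UnicodeDecodeError, ValueError):
            continue

encoded = base64.b64encode(b'{"title": "demo"}').decode('ascii')
fragments = [encoded[10:], encoded[:10]]  # a rotated split of the payload
assert check_sequence(fragments)['title'] == 'demo'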
# ===== youtube-dl/youtube_dl/extractor/bbc.py =====

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    parse_duration,
    parse_iso8601,
    remove_end,
    unescapeHTML,
)
from ..compat import (
    compat_etree_fromstring,
    compat_HTTPError,
)


class BBCCoUkIE(InfoExtractor):
    IE_NAME = 'bbc.co.uk'
    IE_DESC = 'BBC iPlayer'
    _ID_REGEX = r'[pb][\da-z]{7}'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?bbc\.co\.uk/
                        (?:
                            programmes/(?!articles/)|
                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                            music/clips[/#]|
                            radio/player/
                        )
                        (?P<id>%s)
                    ''' % _ID_REGEX

    _MEDIASELECTOR_URLS = [
        # Provides HQ HLS streams with even better quality than the pc mediaset but fails
        # with geolocation in some cases when it's even not geo restricted at all (e.g.
        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
    ]

    _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'

    _NAMESPACES = (
        _MEDIASELECTION_NS,
        _EMP_PLAYLIST_NS,
    )

    _TESTS = [
        {
            'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
            'info_dict': {
                'id': 'b039d07m',
                'ext': 'flv',
                'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
                'description': 'The Canadian poet and songwriter reflects on his musical career.',
            },
            'params': {'skip_download': True},  # rtmp download
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Man in Black: Series 3: The Printed Name',
                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
                'duration': 1800,
            },
            'params': {'skip_download': True},  # rtmp download
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
            'info_dict': {
                'id': 'b00yng1d',
                'ext': 'flv',
                'title': 'The Voice UK: Series 3: Blind Auditions 5',
                'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
                'duration': 5100,
            },
            'params': {'skip_download': True},  # rtmp download
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
            'info_dict': {
                'id': 'b03k3pb7',
                'ext': 'flv',
                'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
                'description': '2. Invasion',
                'duration': 3600,
            },
            'params': {'skip_download': True},  # rtmp download
            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
        },
        {
            'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
            'info_dict': {
                'id': 'b04v209v',
                'ext': 'flv',
                'title': 'Pete Tong, The Essential New Tune Special',
                'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
                'duration': 10800,
            },
            'params': {'skip_download': True},  # rtmp download
            'skip': 'Episode is no longer available on BBC iPlayer Radio',
        },
        {
            'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
            'note': 'Audio',
            'info_dict': {
                'id': 'p022h44j',
                'ext': 'flv',
                'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
                'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
                'duration': 227,
            },
            'params': {'skip_download': True},  # rtmp download
        },
        {
            'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
            'note': 'Video',
            'info_dict': {
                'id': 'p025c103',
                'ext': 'flv',
                'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
                'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
                'duration': 226,
            },
            'params': {'skip_download': True},  # rtmp download
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
            'info_dict': {
                'id': 'p02n76xf',
                'ext': 'flv',
                'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
                'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
                'duration': 3540,
            },
            'params': {'skip_download': True},  # rtmp download
            'skip': 'geolocation',
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
            'info_dict': {
                'id': 'b05zmgw1',
                'ext': 'flv',
                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
                'title': 'Royal Academy Summer Exhibition',
                'duration': 3540,
            },
            'params': {'skip_download': True},  # rtmp download
            'skip': 'geolocation',
        },
        {
            # iptv-all mediaset fails with geolocation however there is no geo restriction
            # for this programme at all
            'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
            'info_dict': {
                'id': 'b06rkms3',
                'ext': 'flv',
                'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
                'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
            },
            'params': {'skip_download': True},  # rtmp download
        },
        {
            # compact player (https://github.com/rg3/youtube-dl/issues/8147)
            'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
            'info_dict': {
                'id': 'p028bfkj',
                'ext': 'flv',
                'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
                'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
            },
            'params': {'skip_download': True},  # rtmp download
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
            'only_matching': True,
        },
        {
            'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
            'only_matching': True,
        }
    ]

    class MediaSelectionError(Exception):
        def __init__(self, id):
            self.id = id

    def _extract_asx_playlist(self, connection, programme_id):
        asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
        return [ref.get('href') for ref in asx.findall('./Entry/ref')]

    def _extract_connection(self, connection, programme_id):
        formats = []
        kind = connection.get('kind')
        protocol = connection.get('protocol')
        supplier = connection.get('supplier')
        if protocol == 'http':
            href = connection.get('href')
            transfer_format = connection.get('transferFormat')
            # ASX playlist
            if supplier == 'asx':
                for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                    formats.append({
                        'url': ref,
                        'format_id': 'ref%s_%s' % (i, supplier),
                    })
            # Skip DASH until supported
            elif transfer_format == 'dash':
                pass
            elif transfer_format == 'hls':
                formats.extend(self._extract_m3u8_formats(
                    href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                    m3u8_id=supplier, fatal=False))
            # Direct link
            else:
                formats.append({
                    'url': href,
                    'format_id': supplier or kind or protocol,
                })
        elif protocol == 'rtmp':
            application = connection.get('application', 'ondemand')
            auth_string = connection.get('authString')
            identifier = connection.get('identifier')
            server = connection.get('server')
            formats.append({
                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                'play_path': identifier,
                'app': '%s?%s' % (application, auth_string),
                'page_url': 'http://www.bbc.co.uk',
                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                'rtmp_live': False,
                'ext': 'flv',
                'format_id': supplier,
            })
        return formats

    def _extract_items(self, playlist):
        return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)

    def _findall_ns(self, element, xpath):
        elements = []
        for ns in self._NAMESPACES:
            elements.extend(element.findall(xpath % ns))
        return elements

    def _extract_medias(self, media_selection):
        error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
        if error is None:
            # bug fix: the result of this lookup was previously discarded
            error = media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
        if error is not None:
            raise BBCCoUkIE.MediaSelectionError(error.get('id'))
        return self._findall_ns(media_selection, './{%s}media')

    def _extract_connections(self, media):
        return self._findall_ns(media, './{%s}connection')

    def _extract_video(self, media, programme_id):
        formats = []
        vbr = int_or_none(media.get('bitrate'))
        vcodec = media.get('encoding')
        service = media.get('service')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))
        for connection in self._extract_connections(media):
            conn_formats = self._extract_connection(connection, programme_id)
            for format in conn_formats:
                format.update({
                    'width': width,
                    'height': height,
                    'vbr': vbr,
                    'vcodec': vcodec,
                    'filesize': file_size,
                })
                if service:
                    format['format_id'] = '%s_%s' % (service, format['format_id'])
            formats.extend(conn_formats)
        return formats

    def _extract_audio(self, media, programme_id):
        formats = []
        abr = int_or_none(media.get('bitrate'))
        acodec = media.get('encoding')
        service = media.get('service')
        for connection in self._extract_connections(media):
            conn_formats = self._extract_connection(connection, programme_id)
            for format in conn_formats:
                format.update({
                    'format_id': '%s_%s' % (service, format['format_id']),
                    'abr': abr,
                    'acodec': acodec,
                })
            formats.extend(conn_formats)
        return formats

    def _get_subtitles(self, media, programme_id):
        subtitles = {}
        for connection in self._extract_connections(media):
            captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
            subtitles[lang] = [
                {
                    'url': connection.get('href'),
                    'ext': 'ttml',
                },
            ]
        return subtitles

    def _raise_extractor_error(self, media_selection_error):
        raise ExtractorError(
            '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
            expected=True)

    def _download_media_selector(self, programme_id):
        last_exception = None
        for mediaselector_url in self._MEDIASELECTOR_URLS:
            try:
                return self._download_media_selector_url(
                    mediaselector_url % programme_id, programme_id)
            except BBCCoUkIE.MediaSelectionError as e:
                if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
                    last_exception = e
                    continue
                self._raise_extractor_error(e)
        self._raise_extractor_error(last_exception)

    def _download_media_selector_url(self, url, programme_id=None):
        try:
            media_selection = self._download_xml(
                url, programme_id, 'Downloading media selection XML')
        except ExtractorError as ee:
            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
                media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
            else:
                raise
        return self._process_media_selector(media_selection, programme_id)

    def _process_media_selector(self, media_selection, programme_id):
        formats = []
        subtitles = None

        for media in self._extract_medias(media_selection):
            kind = media.get('kind')
            if kind == 'audio':
                formats.extend(self._extract_audio(media, programme_id))
            elif kind == 'video':
                formats.extend(self._extract_video(media, programme_id))
            elif kind == 'captions':
                subtitles = self.extract_subtitles(media, programme_id)
        return formats, subtitles

    def _download_playlist(self, playlist_id):
        try:
            playlist = self._download_json(
                'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
                playlist_id, 'Downloading playlist JSON')

            version = playlist.get('defaultAvailableVersion')
            if version:
                smp_config = version['smpConfig']
                title = smp_config['title']
                description = smp_config['summary']
                for item in smp_config['items']:
                    kind = item['kind']
                    if kind != 'programme' and kind != 'radioProgramme':
                        continue
                    programme_id = item.get('vpid')
                    duration = int_or_none(item.get('duration'))
                    formats, subtitles = self._download_media_selector(programme_id)
                return programme_id, title, description, duration, formats, subtitles
        except ExtractorError as ee:
            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
                raise
        # fallback to legacy playlist
        return self._process_legacy_playlist(playlist_id)

    def _process_legacy_playlist_url(self, url, display_id):
        playlist = self._download_legacy_playlist_url(url, display_id)
        return self._extract_from_legacy_playlist(playlist, display_id)

    def _process_legacy_playlist(self, playlist_id):
        return self._process_legacy_playlist_url(
            'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)

    def _download_legacy_playlist_url(self, url, playlist_id=None):
        return self._download_xml(
            url, playlist_id, 'Downloading legacy playlist XML')

    def _extract_from_legacy_playlist(self, playlist, playlist_id):
        no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
        if no_items is not None:
            reason = no_items.get('reason')
            if reason == 'preAvailability':
                msg = 'Episode %s is not yet available' % playlist_id
            elif reason == 'postAvailability':
                msg = 'Episode %s is no longer available' % playlist_id
            elif reason == 'noMedia':
                msg = 'Episode %s is not currently available' % playlist_id
            else:
                msg = 'Episode %s is not available: %s' % (playlist_id, reason)
            raise ExtractorError(msg, expected=True)

        for item in self._extract_items(playlist):
            kind = item.get('kind')
            if kind != 'programme' and kind != 'radioProgramme':
                continue
            title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
            description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
            description = description_el.text if description_el is not None else None

            def get_programme_id(item):
                def get_from_attributes(item):
                    for p in ('identifier', 'group'):
                        value = item.get(p)
                        if value and re.match(r'^[pb][\da-z]{7}$', value):
                            return value
                # bug fix: the result was previously discarded, so the item's own
                # attributes could never yield a programme id
                value = get_from_attributes(item)
                if value:
                    return value
                mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
                if mediator is not None:
                    return get_from_attributes(mediator)

            programme_id = get_programme_id(item)
            duration = int_or_none(item.get('duration'))

            if programme_id:
                formats, subtitles = self._download_media_selector(programme_id)
            else:
                formats, subtitles = self._process_media_selector(item, playlist_id)
                programme_id = playlist_id

        return programme_id, title, description, duration, formats, subtitles

    def _real_extract(self, url):
        group_id = self._match_id(url)

        webpage = self._download_webpage(url, group_id, 'Downloading video page')

        programme_id = None
        duration = None

        tviplayer = self._search_regex(
            r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
            webpage, 'player', default=None)

        if tviplayer:
            player = self._parse_json(tviplayer, group_id).get('player', {})
            duration = int_or_none(player.get('duration'))
            programme_id = player.get('vpid')

        if not programme_id:
            programme_id = self._search_regex(
                r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX,
                webpage, 'vpid', fatal=False, default=None)

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
            title = self._og_search_title(webpage, default=None) or self._html_search_regex(
                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
            description = self._search_regex(
                (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
                 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
                webpage, 'description', default=None)
            if not description:
                description = self._html_search_meta('description', webpage)
        else:
            programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)

        self._sort_formats(formats)

        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }


class BBCIE(BBCCoUkIE):
    IE_NAME = 'bbc'
    IE_DESC = 'BBC'
    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'

    _MEDIASELECTOR_URLS = [
        # Provides HQ HLS streams but fails with geolocation in some cases when it's
        # even not geo restricted at all
        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
        # Provides more formats, namely direct mp4 links, but fails on some videos with
        # notukerror for non UK (?) users (e.g.
        # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
        'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
        # Provides fewer formats, but works everywhere for everybody (hopefully)
        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
    ]

    _TESTS = [{
        # article with multiple videos embedded with data-playable containing vpids
        'url': 'http://www.bbc.com/news/world-europe-32668511',
        'info_dict': {
            'id': 'world-europe-32668511',
            'title': 'Russia stages massive WW2 parade despite Western boycott',
            'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
        },
        'playlist_count': 2,
    }, {
        # article with multiple videos embedded with data-playable (more videos)
        'url': 'http://www.bbc.com/news/business-28299555',
        'info_dict': {
            'id': 'business-28299555',
            'title': 'Farnborough Airshow: Video highlights',
            'description': 'BBC reports and video highlights at the Farnborough Airshow.',
        },
        'playlist_count': 9,
        'skip': 'Save time',
    }, {
        # article with multiple videos embedded with `new SMP()`
        # broken
        'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
        'info_dict': {
            'id': '3662a707-0af9-3149-963f-47bea720b460',
            'title': 'BBC Blogs - Adam Curtis - BUGGER',
        },
        'playlist_count': 18,
    }, {
        # single video embedded with data-playable containing vpid
        'url': 'http://www.bbc.com/news/world-europe-32041533',
        'info_dict': {
            'id': 'p02mprgb',
            'ext': 'mp4',
            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
            'description': 'md5:2868290467291b37feda7863f7a83f54',
            'duration': 47,
            'timestamp': 1427219242,
            'upload_date': '20150324',
        },
        'params': {'skip_download': True},  # rtmp download
    }, {
        # article with single video embedded with data-playable containing XML playlist
        # with direct video links as progressiveDownloadUrl (for now these are extracted)
        # and playlist with f4m and m3u8 as streamingUrl
        'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
        'info_dict': {
            'id': '150615_telabyad_kentin_cogu',
            'ext': 'mp4',
            'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
            'timestamp': 1434397334,
            'upload_date': '20150615',
        },
        'params': {'skip_download': True},
    }, {
        # single video embedded with data-playable containing XML playlists (regional section)
        'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
        'info_dict': {
            'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
            'ext': 'mp4',
            'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
            'timestamp': 1434713142,
            'upload_date': '20150619',
        },
        'params': {'skip_download': True},
    }, {
        # single video from video playlist embedded with vxp-playlist-data JSON
        'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
        'info_dict': {
            'id': 'p02w6qjc',
            'ext': 'mp4',
            'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
            'duration': 56,
            'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
        },
        'params': {'skip_download': True},
    }, {
        # single video story with digitalData
        'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
        'info_dict': {
            'id': 'p02q6gc4',
            'ext': 'flv',
            'title': 'Sri Lanka’s spicy secret',
            'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
            'timestamp': 1437674293,
            'upload_date': '20150723',
        },
        'params': {'skip_download': True},  # rtmp download
    }, {
        # single video story without digitalData
        'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
        'info_dict': {
            'id': 'p018zqqg',
            'ext': 'mp4',
            'title': 'Hyundai Santa Fe Sport: Rock star',
            'description': 'md5:b042a26142c4154a6e472933cf20793d',
            'timestamp': 1415867444,
            'upload_date': '20141113',
        },
        'params': {'skip_download': True},  # rtmp download
    }, {
        # single video with playlist.sxml URL in playlist param
        'url': 'http://www.bbc.com/sport/0/football/33653409',
        'info_dict': {
            'id': 'p02xycnp',
            'ext': 'mp4',
            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
            'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
            'duration': 140,
        },
        'params': {'skip_download': True},  # rtmp download
    }, {
        # article with multiple videos embedded with playlist.sxml in playlist param
        'url': 'http://www.bbc.com/sport/0/football/34475836',
        'info_dict': {
            'id': '34475836',
            'title': 'What Liverpool can expect from Klopp',
        },
        'playlist_count': 3,
    }, {
        # single video with playlist URL from weather section
        'url': 'http://www.bbc.com/weather/features/33601775',
        'only_matching': True,
    }, {
        # custom redirection to www.bbc.com
        'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url)

    def _extract_from_media_meta(self, media_meta, video_id):
        # Direct links to media in media metadata (e.g.
        # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
        # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
        source_files = media_meta.get('sourceFiles')
        if source_files:
            return [{
                'url': f['url'],
                'format_id': format_id,
                'ext': f.get('encoding'),
                'tbr': float_or_none(f.get('bitrate'), 1000),
                'filesize': int_or_none(f.get('filesize')),
            } for format_id, f in source_files.items() if f.get('url')], []

        programme_id = media_meta.get('externalId')
        if programme_id:
            return self._download_media_selector(programme_id)

        # Process playlist.sxml as legacy playlist
        href = media_meta.get('href')
        if href:
            playlist = self._download_legacy_playlist_url(href)
            _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
            return formats, subtitles

        return [], []

    def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
        programme_id, title, description, duration, formats, subtitles = \
            self._process_legacy_playlist_url(url, playlist_id)
        self._sort_formats(formats)
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
        timestamp = json_ld_info.get('timestamp')
        playlist_title = json_ld_info.get('title')
        playlist_description = json_ld_info.get('description')

        if not timestamp:
            timestamp = parse_iso8601(self._search_regex(
                [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
                 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
                 r'"datePublished":\s*"([^"]+)'],
                webpage, 'date', default=None))

        entries = []

        # article with multiple videos embedded with playlist.sxml (e.g.
        # http://www.bbc.com/sport/0/football/34475836)
        playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
        playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
        if playlists:
            entries = [
                self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
                for playlist_url in playlists]

        # news article with multiple videos embedded with data-playable
        data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
        if data_playables:
            for _, data_playable_json in data_playables:
                data_playable = self._parse_json(
                    unescapeHTML(data_playable_json), playlist_id, fatal=False)
                if not data_playable:
                    continue
                settings = data_playable.get('settings', {})
                if settings:
                    # data-playable with video vpid in settings.playlistObject.items (e.g.
                    # http://www.bbc.com/news/world-us-canada-34473351)
                    playlist_object = settings.get('playlistObject', {})
                    if playlist_object:
                        items = playlist_object.get('items')
                        if items and isinstance(items, list):
                            title = playlist_object['title']
                            description = playlist_object.get('summary')
                            duration = int_or_none(items[0].get('duration'))
                            programme_id = items[0].get('vpid')
                            formats, subtitles = self._download_media_selector(programme_id)
                            self._sort_formats(formats)
                            entries.append({
                                'id': programme_id,
                                'title': title,
                                'description': description,
                                'timestamp': timestamp,
                                'duration': duration,
                                'formats': formats,
                                'subtitles': subtitles,
                            })
                    else:
                        # data-playable without vpid but with a playlist.sxml URLs
                        # in otherSettings.playlist (e.g.
# http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) playlist = data_playable.get('otherSettings', {}).get('playlist', {}) if playlist: entries.append(self._extract_from_playlist_sxml( playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) if entries: playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') playlist_description = playlist_description or self._og_search_description(webpage, default=None) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( [r'data-video-player-vpid="(%s)"' % self._ID_REGEX, r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) if programme_id: formats, subtitles = self._download_media_selector(programme_id) self._sort_formats(formats) # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star) digital_data = self._parse_json( self._search_regex( r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'), programme_id, fatal=False) page_info = digital_data.get('page', {}).get('pageInfo', {}) title = page_info.get('pageName') or self._og_search_title(webpage) description = page_info.get('description') or self._og_search_description(webpage) timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp return { 'id': programme_id, 'title': title, 'description': description, 'timestamp': timestamp, 'formats': formats, 'subtitles': subtitles, } playlist_title = self._html_search_regex( r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') playlist_description = self._og_search_description(webpage, default=None) def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), re.findall(pattern, webpage)))) # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX entries = [] for match in extract_all(r'new\s+SMP\(({.+?})\)'): embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') if embed_url and re.match(EMBED_URL, embed_url): entries.append(embed_url) entries.extend(re.findall( r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) if entries: return self.playlist_result( [self.url_result(entry, 'BBCCoUk') for entry in entries], playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) medias = extract_all(r"data-media-meta='({[^']+})'") if not medias: # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international) media_asset = self._search_regex( r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset', default=None) if media_asset: media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) medias = [] for video in media_asset_page.get('videos', {}).values(): medias.extend(video.values()) if not medias: # Multiple video playlist with single `now playing` entry (e.g. 
# http://www.bbc.com/news/video_and_audio/must_see/33767813) vxp_playlist = self._parse_json( self._search_regex( r']+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)', webpage, 'playlist data'), playlist_id) playlist_medias = [] for item in vxp_playlist: media = item.get('media') if not media: continue playlist_medias.append(media) # Download single video if found media with asset id matching the video id from URL if item.get('advert', {}).get('assetId') == playlist_id: medias = [media] break # Fallback to the whole playlist if not medias: medias = playlist_medias entries = [] for num, media_meta in enumerate(medias, start=1): formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) if not formats: continue self._sort_formats(formats) video_id = media_meta.get('externalId') if not video_id: video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) title = media_meta.get('caption') if not title: title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) images = [] for image in media_meta.get('images', {}).values(): images.extend(image.values()) if 'image' in media_meta: images.append(media_meta['image']) thumbnails = [{ 'url': image.get('href'), 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), } for image in images] entries.append({ 'id': video_id, 'title': title, 'thumbnails': thumbnails, 'duration': duration, 'timestamp': timestamp, 'formats': formats, 'subtitles': subtitles, }) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) class BBCCoUkArticleIE(InfoExtractor): _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' _TEST = { 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', 'info_dict': { 'id': '3jNQLTMrPlYGTBn0WV6M2MS', 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', }, 'playlist_count': 4, 'add_ie': ['BBCCoUk'], } def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) title = self._og_search_title(webpage) description = self._og_search_description(webpage).strip() entries = [self.url_result(programme_url) for programme_url in re.findall( r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] return self.playlist_result(entries, playlist_id, title, description) youtube-dl/youtube_dl/extractor/generic.py0000644000000000000000000024030112662564617020040 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import os import re import sys from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( compat_etree_fromstring, compat_urllib_parse_unquote, compat_urlparse, compat_xml_parse_error, ) from ..utils import ( determine_ext, ExtractorError, float_or_none, HEADRequest, is_html, orderedSet, sanitized_Request, smuggle_url, unescapeHTML, unified_strdate, unsmuggle_url, UnsupportedError, url_basename, xpath_text, ) from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import 
RUTVIE from .tvc import TVCIE from .sportbox import SportBoxEmbedIE from .smotri import SmotriIE from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE from .pladform import PladformIE from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' _TESTS = [ # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', 'md5': '67d406c2bcb6af27fa886f31aa934bbe', 'info_dict': { 'id': 'trailer', 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', } }, # Direct link to media delivered compressed (until Accept-Encoding is *) { 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', 'md5': '128c42e68b13950268b648275386fc74', 'info_dict': { 'id': 'FictionJunction-Parallel_Hearts', 'ext': 'flac', 'title': 'FictionJunction-Parallel_Hearts', 'upload_date': '20140522', }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' ] }, # Direct download with broken HEAD { 'url': 'http://ai-radio.org:8000/radio.opus', 'info_dict': { 'id': 'radio', 'ext': 'opus', 'title': 'radio', }, 'params': { 'skip_download': True, # infinite live stream }, 'expected_warnings': [ r'501.*Not Implemented' ], }, # Direct link with incorrect MIME type { 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', 'md5': '4ccbebe5f36706d85221f204d7eb5913', 'info_dict': { 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', 'id': '5_Lennart_Poettering_-_Systemd', 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' 
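            # (this exact message comes from the direct-link fallback later in
            # _real_extract, after is_html() rejects the first 512 bytes)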
] }, # RSS feed { 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'info_dict': { 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', 'description': 're:.*groundbreaking video review series.*' }, 'playlist_mincount': 11, }, # RSS feed with enclosure { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', 'ext': 'm4v', 'upload_date': '20150228', 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', } }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', 'info_dict': { 'id': 'smil', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', 'upload_date': '20130627', 'formats': 'mincount:16', 'subtitles': 'mincount:1', }, 'params': { 'force_generic_extractor': True, 'skip_download': True, }, }, # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html { 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', 'info_dict': { 'id': 'hds', 'ext': 'flv', 'title': 'hds', 'formats': 'mincount:1', }, 'params': { 'skip_download': True, }, }, # SMIL from https://www.restudy.dk/video/play/id/1637 { 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', 'info_dict': { 'id': 'video_1637', 'ext': 'flv', 'title': 'video_1637', 'formats': 'mincount:3', }, 'params': { 'skip_download': True, }, }, # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm { 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', 'info_dict': { 'id': 'smil-service', 'ext': 'flv', 'title': 'smil-service', 'formats': 'mincount:1', }, 'params': { 'skip_download': True, }, }, # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 { 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', 'info_dict': { 'id': '4719370', 'ext': 'mp4', 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', 'formats': 'mincount:3', }, 'params': { 'skip_download': True, }, }, # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html { 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', 'info_dict': { 'id': 'mZlp2ctYIUEB', 'ext': 'mp4', 'title': 'Tikibad ontruimd wegens brand', 'description': 'md5:05ca046ff47b931f9b04855015e163a4', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 33, }, 'params': { 'skip_download': True, }, }, # MPD from http://dash-mse-test.appspot.com/media.html { 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', 'info_dict': { 'id': 'car-20120827-manifest', 'ext': 'mp4', 'title': 'car-20120827-manifest', 'formats': 'mincount:9', }, 'params': { 'format': 'bestvideo', }, }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', 'info_dict': { 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', 'uploader_id': 'TheVerge', 'description': 're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, 'params': { 'skip_download': False, } }, { # redirect in Refresh HTTP header 
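        # (the server replies with a "Refresh: 0; url=..." header instead of a
        # 3xx status, and the generic extractor follows that target URL as if
        # it were an ordinary redirect)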
'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', 'info_dict': { 'id': 'pO8h3EaFRdo', 'ext': 'mp4', 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', 'upload_date': '20150917', 'uploader_id': 'brtvofficial', 'uploader': 'Boiler Room', }, 'params': { 'skip_download': False, }, }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', 'info_dict': { 'id': '13601338388002', 'ext': 'mp4', 'uploader': 'www.hodiho.fr', 'title': 'R\u00e9gis plante sa Jeep', } }, # bandcamp page with custom domain { 'add_ie': ['Bandcamp'], 'url': 'http://bronyrock.com/track/the-pony-mash', 'info_dict': { 'id': '3235767654', 'ext': 'mp3', 'title': 'The Pony Mash', 'uploader': 'M_Pallante', }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, # embedded brightcove video # it also tests brightcove videos that need to set the 'Referer' in the # http requests { 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { 'id': '2765128793001', 'ext': 'mp4', 'title': 'Le cours de bourse : l’analyse technique', 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', 'uploader': 'BFM BUSINESS', }, 'params': { 'skip_download': True, }, }, { # https://github.com/rg3/youtube-dl/issues/2253 'url': 'http://bcove.me/i6nfkrc3', 'md5': '0ba9446db037002366bab3b3eb30c88c', 'info_dict': { 'id': '3101154703001', 'ext': 'mp4', 'title': 'Still no power', 'uploader': 'thestar.com', 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, 'add_ie': ['BrightcoveLegacy'], }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', 'md5': 'fb973ecf6e4a78a67453647444222983', 'info_dict': { 'id': '3414141473001', 'ext': 'mp4', 'title': 'Видео. 
Удаление Дзагоева (ЦСКА)', 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', 'uploader': 'Championat', }, }, { # https://github.com/rg3/youtube-dl/issues/3541 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', 'info_dict': { 'id': '3866516442001', 'ext': 'mp4', 'title': 'Leer mij vrouwen kennen: Aflevering 1', 'description': 'Leer mij vrouwen kennen: Aflevering 1', 'uploader': 'SBS Broadcasting', }, 'skip': 'Restricted to Netherlands', 'params': { 'skip_download': True, # m3u8 download }, }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', 'md5': '166dd577b433b4d4ebfee10b0824d8ff', 'info_dict': { 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', 'title': '2cc213299525360.mov', # that's what we get 'duration': 238.231, }, 'add_ie': ['Ooyala'], }, { # ooyala video embedded with http://player.ooyala.com/iframe.js 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', 'info_dict': { 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', 'ext': 'mp4', 'title': '"Steve Jobs: Man in the Machine" trailer', 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', 'duration': 135.427, }, 'params': { 'skip_download': True, }, }, # multiple ooyala embeds on SBN network websites { 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', 'info_dict': { 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok', 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com', }, 'playlist_mincount': 3, 'params': { 'skip_download': True, }, 'add_ie': ['Ooyala'], }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', 'info_dict': { 'id': '9ODmcdjQcHQ', 'ext': 'mp4', 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second', 'upload_date': '20140225', 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff', 'uploader': 'Tested', 'uploader_id': 'testedcom', }, # No need to test YoutubeIE here 'params': { 'skip_download': True, }, }, # funnyordie embed { 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', 'info_dict': { 'id': '18e820ec3f', 'ext': 'mp4', 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama', 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, }, # RUTV embed { 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', 'info_dict': { 'id': '776940', 'ext': 'mp4', 'title': 'Охотское море стало целиком российским', 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43', }, 'params': { # m3u8 download 'skip_download': True, }, }, # TVC embed { 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', 'info_dict': { 'id': '55304', 'ext': 'mp4', 'title': 'Дошкольное воспитание', }, }, # SportBox embed { 'url': 'http://www.vestifinance.ru/articles/25753', 'info_dict': { 'id': '25753', 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"', }, 'playlist': [{ 'info_dict': { 'id': '370908', 'title': 'Госзаказ. 
День 3', 'ext': 'mp4', } }, { 'info_dict': { 'id': '370905', 'title': 'Госзаказ. День 2', 'ext': 'mp4', } }, { 'info_dict': { 'id': '370902', 'title': 'Госзаказ. День 1', 'ext': 'mp4', } }], 'params': { # m3u8 download 'skip_download': True, }, }, # Myvi.ru embed { 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1', 'info_dict': { 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', 'ext': 'mp4', 'title': 'Ужастики, русский трейлер (2015)', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 153, } }, # XHamster embed { 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', 'info_dict': { 'id': 'showthread', 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', }, 'playlist_mincount': 7, }, # Embedded TED video { 'url': 'http://en.support.wordpress.com/videos/ted-talks/', 'md5': '65fdff94098e4a607385a60c5177c638', 'info_dict': { 'id': '1969', 'ext': 'mp4', 'title': 'Hidden miracles of the natural world', 'uploader': 'Louie Schwartzberg', 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, # Embedded Ustream video { 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm', 'md5': '27b99cdb639c9b12a79bca876a073417', 'info_dict': { 'id': '45734260', 'ext': 'flv', 'uploader': 'AU SPA: The NSA and Privacy', 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman' } }, # nowvideo embed hidden behind percent encoding { 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', 'md5': '2baf4ddd70f697d94b1c18cf796d5107', 'info_dict': { 'id': '06e53103ca9aa', 'ext': 'flv', 'title': 'Macross Episode 001 Watch Macross Episode 001 onl', 'description': 'No description', }, }, # arte embed { 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html', 'md5': '7653032cbb25bf6c80d80f217055fa43', 'info_dict': { 'id': '048195-004_PLUS7-F', 'ext': 'flv', 'title': 'X:enius', 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168', 'upload_date': '20140320', }, 'params': { 'skip_download': 'Requires rtmpdump' } }, # francetv embed { 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero', 'info_dict': { 'id': 'EV_30231', 'ext': 'mp4', 'title': 'Alcaline, le concert avec Calogero', 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', 'upload_date': '20150226', 'timestamp': 1424989860, 'duration': 5400, }, 'params': { # m3u8 downloads 'skip_download': True, }, 'expected_warnings': [ 'Forbidden' ] }, # Condé Nast embed { 'url': 'http://www.wired.com/2014/04/honda-asimo/', 'md5': 'ba0dfe966fa007657bd1443ee672db0f', 'info_dict': { 'id': '53501be369702d3275860000', 'ext': 'mp4', 'title': 'Honda’s New Asimo Robot Is More Human Than Ever', } }, # Dailymotion embed { 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/', 'md5': '441aeeb82eb72c422c7f14ec533999cd', 'info_dict': { 'id': 'k2mm4bCdJ6CQ2i7c8o2', 'ext': 'mp4', 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', 'uploader': 'Spi0n', }, 'add_ie': ['Dailymotion'], }, # YouTube embed { 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', 'info_dict': { 'id': 'FXRb4ykk4S0', 'ext': 'mp4', 'title': 'The NBL Auction 2014', 'uploader': 'BADMINTON England', 'uploader_id': 'BADMINTONEvents', 'upload_date': '20140603', 'description': 'md5:9ef128a69f1e262a700ed83edb163a73', }, 'add_ie': ['Youtube'], 'params': { 'skip_download': True, } }, # MTVSercices embed { 
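        # (MTVServices embeds, handled by the MTVServicesEmbeddedIE class
        # imported at the top of this file)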
'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too', 'md5': '35727f82f58c76d996fc188f9755b0d5', 'info_dict': { 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9', 'ext': 'mp4', 'title': 'Review', 'description': 'Mario\'s life in the fast lane has never looked so good.', }, }, # YouTube embed via { 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', 'info_dict': { 'id': '4vAffPZIT44', 'ext': 'mp4', 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', 'uploader': 'Gameloft', 'uploader_id': 'gameloft', 'upload_date': '20140828', 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', }, 'params': { 'skip_download': True, } }, # Camtasia studio { 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', 'playlist': [{ 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', 'info_dict': { 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', 'ext': 'flv', 'duration': 2235.90, } }, { 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', 'info_dict': { 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', 'ext': 'flv', 'duration': 2235.93, } }], 'info_dict': { 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', } }, # Flowplayer { 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', 'md5': '9d65602bf31c6e20014319c7d07fba27', 'info_dict': { 'id': '5123ea6d5e5a7', 'ext': 'mp4', 'age_limit': 18, 'uploader': 'www.handjobhub.com', 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, # Multiple brightcove videos # https://github.com/rg3/youtube-dl/issues/2283 { 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', 'info_dict': { 'id': 'always-never', 'title': 'Always / Never - The New Yorker', }, 'playlist_count': 3, 'params': { 'extract_flat': False, 'skip_download': True, } }, # MLB embed { 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', 'md5': '96f09a37e44da40dd083e12d9a683327', 'info_dict': { 'id': '33322633', 'ext': 'mp4', 'title': 'Ump changes call to ball', 'description': 'md5:71c11215384298a172a6dcb4c2e20685', 'duration': 48, 'timestamp': 1401537900, 'upload_date': '20140531', 'thumbnail': 're:^https?://.*\.jpg$', }, }, # Wistia embed { 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', 'md5': '8788b683c777a5cf25621eaf286d0c23', 'info_dict': { 'id': '1cfaf6b7ea', 'ext': 'mov', 'title': 'md5:51364a8d3d009997ba99656004b5e20d', 'duration': 643.0, 'filesize': 182808282, 'uploader': 'education-portal.com', }, }, { 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', 'info_dict': { 'id': 'uxjb0lwrcz', 'ext': 'mp4', 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', 'duration': 1715.0, 'uploader': 'thoughtworks.wistia.com', }, }, # Soundcloud embed { 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', 'info_dict': { 'id': '174391317', 'ext': 'mp3', 'description': 'md5:ff867d6b555488ad3c52572bb33d432c', 'uploader': 'Sophos Security', 'title': 'Chet Chat 171 - Oct 29, 2014', 'upload_date': '20141029', } }, # Livestream embed { 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', 
'info_dict': { 'id': '67864563', 'ext': 'flv', 'upload_date': '20141112', 'title': 'Rosetta #CometLanding webcast HL 10', } }, # LazyYT { 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', 'info_dict': { 'id': '1986', 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', }, 'playlist_mincount': 2, }, # Cinchcast embed { 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', 'info_dict': { 'id': '7141703', 'ext': 'mp3', 'upload_date': '20141126', 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', } }, # Cinerama player { 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm', 'info_dict': { 'id': '730m_DandD_1901_512k', 'ext': 'mp4', 'uploader': 'www.abc.net.au', 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015', } }, # embedded viddler video { 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597', 'info_dict': { 'id': '4d03aad9', 'ext': 'mp4', 'uploader': 'deadspin', 'title': 'WALL-TO-GORTAT', 'timestamp': 1422285291, 'upload_date': '20150126', }, 'add_ie': ['Viddler'], }, # Libsyn embed { 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', 'info_dict': { 'id': '3377616', 'ext': 'mp3', 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", 'description': 'md5:601cb790edd05908957dae8aaa866465', 'upload_date': '20150220', }, }, # jwplayer YouTube { 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', 'info_dict': { 'id': 'Mrj4DVp2zeA', 'ext': 'mp4', 'upload_date': '20150212', 'uploader': 'The National Archives UK', 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6', 'uploader_id': 'NationalArchives08', 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', }, }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', 'playlist_mincount': 5, 'info_dict': { 'id': 'aanslagen-kopenhagen', 'title': 'Aanslagen Kopenhagen | RTL Nieuws', } }, # Zapiks embed { 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html', 'info_dict': { 'id': '118046', 'ext': 'mp4', 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', } }, # Kaltura embed { 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15', 'info_dict': { 'id': '1_eergr3h1', 'ext': 'mp4', 'upload_date': '20150226', 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com', 'timestamp': int, 'title': 'John Carlson Postgame 2/25/15', }, }, # Kaltura embed (different embed code) { 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', 'info_dict': { 'id': '1_a52wc67y', 'ext': 'flv', 'upload_date': '20150127', 'uploader_id': 'PremierMedia', 'timestamp': int, 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? 
Conference 2014', }, }, # Kaltura embed protected with referrer { 'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero', 'info_dict': { 'id': '1_g4fbemnq', 'ext': 'mp4', 'title': 'Violetta - Achter De Schermen - Ruggero', 'description': 'Achter de schermen met Ruggero', 'timestamp': 1435133761, 'upload_date': '20150624', 'uploader_id': 'echojecka', }, }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', 'info_dict': { 'id': '227304', 'ext': 'mp4', 'title': 'Навальный вышел на свободу', 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 87, 'view_count': int, 'age_limit': 0, }, }, # ClipYou (Eagle.Platform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', 'info_dict': { 'id': '12820', 'ext': 'mp4', 'title': "'O Sole Mio", 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 216, 'view_count': int, }, }, # Pladform embed { 'url': 'http://muz-tv.ru/kinozal/view/7400/', 'info_dict': { 'id': '100183293', 'ext': 'mp4', 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 694, 'age_limit': 0, }, }, # Playwire embed { 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html', 'info_dict': { 'id': '3519514', 'ext': 'mp4', 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer', 'thumbnail': 're:^https?://.*\.png$', 'duration': 45.115, }, }, # 5min embed { 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', 'md5': '4c6f127a30736b59b3e2c19234ee2bf7', 'info_dict': { 'id': '518726732', 'ext': 'mp4', 'title': 'Facebook Creates "On This Day" | Crunch Report', }, }, # SVT embed { 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', 'info_dict': { 'id': '2900353', 'ext': 'flv', 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', 'duration': 27, 'age_limit': 0, }, }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', 'info_dict': { 'id': '8RUoRhRi', 'ext': 'mp4', 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!", 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f', 'timestamp': 1428207000, 'upload_date': '20150405', 'uploader': 'Heather', }, }, # Crooks and Liars external embed { 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/', 'info_dict': { 'id': 'MTE3MjUtMzQ2MzA', 'ext': 'mp4', 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5', 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec', 'timestamp': 1265032391, 'upload_date': '20100201', 'uploader': 'Heather', }, }, # NBC Sports vplayer embed { 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', 'info_dict': { 'id': 'ln7x1qSThw4k', 'ext': 'flv', 'title': "PFT Live: New leader in the 'new-look' defense", 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', }, }, # UDN embed { 'url': 'http://www.udn.com/news/story/7314/822787', 'md5': 'fd2060e988c326991037b9aff9df21a6', 'info_dict': { 'id': '300346', 'ext': 'mp4', 'title': '中一中男師變性 全校師生力挺', 'thumbnail': 're:^https?://.*\.jpg$', } }, # Ooyala embed { 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', 'info_dict': { 'id': 
'50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', 'ext': 'mp4', 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.', 'title': 'This is what separates the Excel masters from the wannabes', 'duration': 191.933, }, 'params': { # m3u8 downloads 'skip_download': True, } }, # Contains a SMIL manifest { 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html', 'info_dict': { 'id': 'file', 'ext': 'flv', 'title': '+ Football: Lottery Champions League Europe', 'uploader': 'www.telewebion.com', }, 'params': { # rtmpe downloads 'skip_download': True, } }, # Brightcove URL in single quotes { 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', 'md5': '4ae374f1f8b91c889c4b9203c8c752af', 'info_dict': { 'id': '4255764656001', 'ext': 'mp4', 'title': 'SN Presents: Russell Martin, World Citizen', 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', 'uploader': 'Rogers Sportsnet', }, }, # Dailymotion Cloud video { 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', 'md5': '49444254273501a64675a7e68c502681', 'info_dict': { 'id': '5585de919473990de4bee11b', 'ext': 'mp4', 'title': 'Le débat', 'thumbnail': 're:^https?://.*\.jpe?g$', } }, # OnionStudios embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', 'info_dict': { 'id': '2855', 'ext': 'mp4', 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', 'thumbnail': 're:^https?://.*\.jpe?g$', 'uploader': 'ClickHole', 'uploader_id': 'clickhole', } }, # SnagFilms embed { 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html', 'info_dict': { 'id': '74849a00-85a9-11e1-9660-123139220831', 'ext': 'mp4', 'title': '#whilewewatch', } }, # AdobeTVVideo embed { 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', 'md5': '43662b577c018ad707a63766462b1e87', 'info_dict': { 'id': '2456', 'ext': 'mp4', 'title': 'New experience with Acrobat DC', 'description': 'New experience with Acrobat DC', 'duration': 248.667, }, }, # ScreenwaveMedia embed { 'url': 'http://www.thecinemasnob.com/the-cinema-snob/a-nightmare-on-elm-street-2-freddys-revenge1', 'md5': '24ace5baba0d35d55c6810b51f34e9e0', 'info_dict': { 'id': 'cinemasnob-55d26273809dd', 'ext': 'mp4', 'title': 'cinemasnob', }, }, # BrightcoveInPageEmbed embed { 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', 'info_dict': { 'id': '4238694884001', 'ext': 'flv', 'title': 'Tabletop: Dread, Last Thoughts', 'description': 'Tabletop: Dread, Last Thoughts', 'duration': 51690, }, }, # JWPlayer with M3U8 { 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', 'info_dict': { 'id': 'playlist', 'ext': 'mp4', 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. 
ВИДЕО | РЕН ТВ', 'uploader': 'ren.tv', }, 'params': { # m3u8 downloads 'skip_download': True, } } ] def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) def _extract_rss(self, url, video_id, doc): playlist_title = doc.find('./channel/title').text playlist_desc_el = doc.find('./channel/description') playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text entries = [] for it in doc.findall('./channel/item'): next_url = xpath_text(it, 'link', fatal=False) if not next_url: enclosure_nodes = it.findall('./enclosure') for e in enclosure_nodes: next_url = e.attrib.get('url') if next_url: break if not next_url: continue entries.append({ '_type': 'url', 'url': next_url, 'title': it.find('title').text, }) return { '_type': 'playlist', 'id': url, 'title': playlist_title, 'description': playlist_desc, 'entries': entries, } def _extract_camtasia(self, url, video_id, webpage): """ Returns None if no camtasia video can be found. """ camtasia_cfg = self._search_regex( r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', webpage, 'camtasia configuration file', default=None) if camtasia_cfg is None: return None title = self._html_search_meta('DC.title', webpage, fatal=True) camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) camtasia_cfg = self._download_xml( camtasia_url, video_id, note='Downloading camtasia configuration', errnote='Failed to download camtasia configuration') fileset_node = camtasia_cfg.find('./playlist/array/fileset') entries = [] for n in fileset_node.getchildren(): url_n = n.find('./uri') if url_n is None: continue entries.append({ 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], 'title': '%s - %s' % (title, n.tag), 'url': compat_urlparse.urljoin(url, url_n.text), 'duration': float_or_none(n.find('./duration').text), }) return { '_type': 'playlist', 'entries': entries, 'title': title, } def _real_extract(self, url): if url.startswith('//'): return { '_type': 'url', 'url': self.http_scheme() + url, } parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: default_search = self._downloader.params.get('default_search') if default_search is None: default_search = 'fixup_error' if default_search in ('auto', 'auto_warning', 'fixup_error'): if '/' in url: self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) elif default_search != 'fixup_error': if default_search == 'auto_warning': if re.match(r'^(?:url|URL)$', url): raise ExtractorError( 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url, expected=True) else: self._downloader.report_warning( 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url) return self.url_result('ytsearch:' + url) if default_search in ('error', 'fixup_error'): raise ExtractorError( '%r is not a valid URL. 
' 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' % (url, url), expected=True) else: if ':' not in default_search: default_search += ':' return self.url_result(default_search + url) url, smuggled_data = unsmuggle_url(url) force_videoid = None is_intentional = smuggled_data and smuggled_data.get('to_generic') if smuggled_data and 'force_videoid' in smuggled_data: force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) self.to_screen('%s: Requesting header' % video_id) head_req = HEADRequest(url) head_response = self._request_webpage( head_req, video_id, note=False, errnote='Could not send HEAD request to %s' % url, fatal=False) if head_response is not False: # Check for redirect new_url = head_response.geturl() if url != new_url: self.report_following_redirect(new_url) if force_videoid: new_url = smuggle_url( new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) full_response = None if head_response is False: request = sanitized_Request(url) request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) head_response = full_response # Check for direct link to a video content_type = head_response.headers.get('Content-Type', '') m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P.+)$', content_type) if m: upload_date = unified_strdate( head_response.headers.get('Last-Modified')) formats = [] if m.group('format_id').endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') else: formats = [{ 'format_id': m.group('format_id'), 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] return { 'id': video_id, 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'formats': formats, 'upload_date': upload_date, } if not self._downloader.params.get('test', False) and not is_intentional: force = self._downloader.params.get('force_generic_extractor', False) self._downloader.report_warning( '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) if not full_response: request = sanitized_Request(url) # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) # making it impossible to download only chunk of the file (yet we need only 512kB to # test whether it's HTML or not). According to youtube-dl default Accept-Encoding # that will always result in downloading the whole file that is not desirable. # Therefore for extraction pass we have to override Accept-Encoding to any in order # to accept raw bytes and being able to download only a chunk. # It may probably better to solve this by checking Content-Type for application/octet-stream # after HEAD request finishes, but not sure if we can rely on this. request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) # Maybe it's a direct link to a video? # Be careful not to download the whole thing! 
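        # A 512-byte sniff is enough: is_html() (from ..utils) skips an
        # optional byte-order mark and checks whether the buffer begins with
        # tag-like markup. Illustrative, assumed inputs:
        #   is_html(b'<!DOCTYPE html><html>')  -> HTML, keep parsing the page
        #   is_html(b'\x00\x00\x01\xba...')    -> not HTML, treat URL as media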
first_bytes = full_response.read(512) if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') upload_date = unified_strdate( head_response.headers.get('Last-Modified')) return { 'id': video_id, 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'url': url, 'upload_date': upload_date, } webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) self.report_extraction(video_id) # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? try: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): return self._parse_smil(doc, url, video_id) elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): return { 'id': video_id, 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'formats': self._parse_mpd_formats( doc, video_id, mpd_base_url=url.rpartition('/')[0]), } except compat_xml_parse_error: pass # Is it a Camtasia project? camtasia_res = self._extract_camtasia(url, video_id, webpage) if camtasia_res is not None: return camtasia_res # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/rg3/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way webpage = compat_urllib_parse_unquote(webpage) # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical video_title = self._html_search_regex( r'(?s)(.*?)', webpage, 'video title', default='video') # Try to detect age limit automatically age_limit = self._rta_search(webpage) # And then there are the jokers who advertise that they use RTA, # but actually don't. 
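        # Each marker below is a regex matched against the raw page text; any
        # hit forces age_limit to 18 even when _rta_search() saw no real RTA
        # label.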
AGE_LIMIT_MARKERS = [ r'Proudly Labeled RTA', ] if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): age_limit = 18 # video uploader is domain name video_uploader = self._search_regex( r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') # Helper method def _playlist_from_matches(matches, getter=None, ie=None): urlrs = orderedSet( self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) for m in matches) return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: self.to_screen('Brightcove video detected.') entries = [{ '_type': 'url', 'url': smuggle_url(bc_url, {'Referer': url}), 'ie_key': 'BrightcoveLegacy' } for bc_url in bc_urls] return { '_type': 'playlist', 'title': video_title, 'id': video_id, 'entries': entries, } # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_urls(webpage) if bc_urls: return _playlist_from_matches(bc_urls, ie='BrightcoveNew') # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', webpage) if matches: return _playlist_from_matches(matches, ie='RtlNl') vimeo_url = VimeoIE._extract_vimeo_url(url, webpage) if vimeo_url is not None: return self.url_result(vimeo_url) vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', webpage, 'vid.me embed', default=None) if vid_me_embed_url is not None: return self.url_result(vid_me_embed_url, 'Vidme') # Look for embedded YouTube player matches = re.findall(r'''(?x) (?: ]+?src=| data-video-url=| ]+?src=| embedSWF\(?:\s*| new\s+SWFObject\( ) (["\']) (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ (?:embed|v|p)/.+?) 
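            # ((?x) verbose mode: re ignores whitespace and comments like this
            # one inside the pattern; the \1 just below must match the same
            # quote character captured by the opening (["\']) group)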
\1''', webpage) if matches: return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) # Look for lazyYT YouTube embed matches = re.findall( r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) if matches: return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) # Look for embedded Dailymotion player matches = re.findall( r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) if matches: return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) # Look for embedded Dailymotion playlist player (#3822) m = re.search( r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) if m: playlists = re.findall( r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) if playlists: return _playlist_from_matches( playlists, lambda p: '//dailymotion.com/playlist/%s' % p) # Look for embedded Wistia player match = re.search( r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) if match: embed_url = self._proto_relative_url( unescapeHTML(match.group('url'))) return { '_type': 'url_transparent', 'url': embed_url, 'ie_key': 'Wistia', 'uploader': video_uploader, 'title': video_title, 'id': video_id, } match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P[^"\']+)', webpage) if match: return { '_type': 'url_transparent', 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')), 'ie_key': 'Wistia', 'uploader': video_uploader, 'title': video_title, 'id': match.group('id') } # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: return self.url_result(svt_url, 'SVT') # Look for embedded condenast player matches = re.findall( r']*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: burl = unescapeHTML(mobj.group(1)) # Don't set the extractor because it can be a track url or an album return self.url_result(burl) # Look for embedded Vevo player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) if mobj is not None: return self.url_result(mobj.group('url')) # Look for embedded Viddler player mobj = re.search( r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', webpage) if mobj is not None: return self.url_result(mobj.group('url')) # Look for NYTimes player mobj = re.search( r']+src=(["\'])(?P(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', webpage) if mobj is not None: return self.url_result(mobj.group('url')) # Look for Libsyn player mobj = re.search( r']+src=(["\'])(?P(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) if mobj is not None: return self.url_result(mobj.group('url')) # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage) or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P.{32})[\'"]', webpage)) if mobj is not None: return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url})) # Look for multiple Ooyala embeds on SBN network websites mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) if mobj is not None: embeds = self._parse_json(mobj.group(1), video_id, 
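                                      # fatal=False: malformed JSON yields None
                                      # (plus a warning) instead of raising, so
                                      # a broken embed list simply skips the
                                      # Ooyala branch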
fatal=False) if embeds: return _playlist_from_matches( embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'', webpage): url = self._search_regex( r'src=(["\'])(?P.+?partnerplayer.+?)\1', iframe, 'player URL', default=None, group='url') if url: break mobj = re.match(self._VALID_URL, url) player_id = mobj.group('player_id') if not display_id: display_id = player_id if player_id: player_page = self._download_webpage( url, display_id, note='Downloading player page', errnote='Could not download player page') video_id = self._search_regex( r'my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P\d+)\.shtml.*?' _TESTS = [{ 'note': 'This video is available only in Mainland China', 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', 'md5': '29175c8cadd8b5cc4055001e85d6b372', 'info_dict': { 'id': '382479172', 'ext': 'mp4', 'title': 'MV:Far East Movement《The Illest》', }, 'skip': 'On available in China', }, { 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', 'md5': '699060e75cf58858dd47fb9c03c42cfb', 'info_dict': { 'id': '409385080', 'ext': 'mp4', 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', } }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', 'md5': '9bf34be48f2f4dadcb226c74127e203c', 'info_dict': { 'id': '78693464', 'ext': 'mp4', 'title': '【爱范品】第31期:MWC见不到的奇葩手机', } }, { 'note': 'Multipart video', 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', 'info_dict': { 'id': '78910339', 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', }, 'playlist': [{ 'md5': 'bdbfb8f39924725e6589c146bc1883ad', 'info_dict': { 'id': '78910339_part1', 'ext': 'mp4', 'duration': 294, 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', 'info_dict': { 'id': '78910339_part2', 'ext': 'mp4', 'duration': 300, 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { 'md5': '8407e634175fdac706766481b9443450', 'info_dict': { 'id': '78910339_part3', 'ext': 'mp4', 'duration': 150, 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }] }, { 'note': 'Video with title containing dash', 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', 'info_dict': { 'id': '78932792', 'ext': 'mp4', 'title': 'youtube-dl testing video', }, 'params': { 'skip_download': True } }] def _real_extract(self, url): def _fetch_data(vid_id, mytv=False): if mytv: base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid=' else: base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' req = sanitized_Request(base_data_url + vid_id) cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') if cn_verification_proxy: req.add_header('Ytdl-request-proxy', cn_verification_proxy) return self._download_json( req, video_id, 'Downloading JSON data for %s' % vid_id) mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') mytv = mobj.group('mytv') is not None webpage = self._download_webpage(url, video_id) title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) vid = self._html_search_regex( r'var vid ?= ?["\'](\d+)["\']', webpage, 'video path') vid_data = _fetch_data(vid, mytv) if vid_data['play'] != 1: if vid_data.get('status') == 12: raise ExtractorError( 'Sohu said: There\'s something wrong in the video.', expected=True) else: raise ExtractorError( 'Sohu said: The video is only licensed to users in Mainland China.', expected=True) formats_json = {} for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): vid_id = vid_data['data'].get('%sVid' % format_id) if not vid_id: continue vid_id = 
compat_str(vid_id) formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) part_count = vid_data['data']['totalBlocks'] playlist = [] for i in range(part_count): formats = [] for format_id, format_data in formats_json.items(): allot = format_data['allot'] data = format_data['data'] clips_url = data['clipsURL'] su = data['su'] video_url = 'newflv.sohu.ccgslb.net' cdnId = None retries = 0 while 'newflv.sohu.ccgslb.net' in video_url: params = { 'prot': 9, 'file': clips_url[i], 'new': su[i], 'prod': 'flash', 'rb': 1, } if cdnId is not None: params['idc'] = cdnId download_note = 'Downloading %s video URL part %d of %d' % ( format_id, i + 1, part_count) if retries > 0: download_note += ' (retry #%d)' % retries part_info = self._parse_json(self._download_webpage( 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)), video_id, download_note), video_id) video_url = part_info['url'] cdnId = part_info.get('nid') retries += 1 if retries > 5: raise ExtractorError('Failed to get video URL') formats.append({ 'url': video_url, 'format_id': format_id, 'filesize': data['clipsBytes'][i], 'width': data['width'], 'height': data['height'], 'fps': data['fps'], }) self._sort_formats(formats) playlist.append({ 'id': '%s_part%d' % (video_id, i + 1), 'title': title, 'duration': vid_data['data']['clipsDuration'][i], 'formats': formats, }) if len(playlist) == 1: info = playlist[0] info['id'] = video_id else: info = { '_type': 'multi_video', 'entries': playlist, 'id': video_id, 'title': title, } return info youtube-dl/youtube_dl/extractor/heise.py0000644000000000000000000000546012641030331017500 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, parse_iso8601, ) class HeiseIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?:www\.)?heise\.de/video/artikel/ .+?(?P[0-9]+)\.html(?:$|[?#]) ''' _TEST = { 'url': ( 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html' ), 'md5': 'ffed432483e922e88545ad9f2f15d30e', 'info_dict': { 'id': '2404147', 'ext': 'mp4', 'title': ( "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone" ), 'format_id': 'mp4_720p', 'timestamp': 1411812600, 'upload_date': '20140927', 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', 'thumbnail': 're:^https?://.*\.jpe?g$', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) container_id = self._search_regex( r'
    (?:revision3|testtube|animalist)\.com)/(?P[^/]+(?:/[^/?#]+)?)' _TESTS = [{ 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016', 'md5': 'd94a72d85d0a829766de4deb8daaf7df', 'info_dict': { 'id': '73034', 'display_id': 'technobuffalo/5-google-predictions-for-2016', 'ext': 'webm', 'title': '5 Google Predictions for 2016', 'description': 'Google had a great 2015, but it\'s already time to look ahead. Here are our five predictions for 2016.', 'upload_date': '20151228', 'timestamp': 1451325600, 'duration': 187, 'uploader': 'TechnoBuffalo', 'uploader_id': 'technobuffalo', } }, { 'url': 'http://testtube.com/brainstuff', 'info_dict': { 'id': '251', 'title': 'BrainStuff', 'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.', }, 'playlist_mincount': 93, }, { 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial', 'info_dict': { 'id': '60163', 'display_id': 'dnews/5-weird-ways-plants-can-eat-animals', 'duration': 275, 'ext': 'webm', 'title': '5 Weird Ways Plants Can Eat Animals', 'description': 'Why have some plants evolved to eat meat?', 'upload_date': '20150120', 'timestamp': 1421763300, 'uploader': 'DNews', 'uploader_id': 'dnews', }, }] _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' def _real_extract(self, url): domain, display_id = re.match(self._VALID_URL, url).groups() page_info = self._download_json( self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id) if page_info['data']['type'] == 'episode': episode_data = page_info['data'] video_id = compat_str(episode_data['video']['data']['id']) video_data = self._download_json( 'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id), video_id)['items'][0] formats = [] for vcodec, media in video_data['media'].items(): for quality_id, quality in media.items(): if quality_id == 'hls': formats.extend(self._extract_m3u8_formats( quality['url'], video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) else: formats.append({ 'url': quality['url'], 'format_id': '%s-%s' % (vcodec, quality_id), 'tbr': int_or_none(quality.get('bitrate')), 'vcodec': vcodec, }) self._sort_formats(formats) preference = qualities(['mini', 'small', 'medium', 'large']) thumbnails = [{ 'url': image_url, 'id': image_id, 'preference': preference(image_id) } for image_id, image_url in video_data.get('images', {}).items()] return { 'id': video_id, 'display_id': display_id, 'title': unescapeHTML(video_data['title']), 'description': unescapeHTML(video_data.get('summary')), 'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '), 'author': episode_data.get('author'), 'uploader': video_data.get('show', {}).get('name'), 'uploader_id': video_data.get('show', {}).get('slug'), 'duration': int_or_none(video_data.get('duration')), 'thumbnails': thumbnails, 'formats': formats, } else: show_data = page_info['show']['data'] episodes_data = page_info['episodes']['data'] num_episodes = page_info['meta']['totalEpisodes'] processed_episodes = 0 entries = [] page_num = 1 while True: entries.extend([self.url_result( 'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data]) processed_episodes += len(episodes_data) if processed_episodes == num_episodes: break page_num += 1 episodes_data = 
self._download_json(self._PAGE_DATA_TEMPLATE % ( domain, display_id + '/' + compat_str(page_num), domain), display_id)['episodes']['data'] return self.playlist_result( entries, compat_str(show_data['id']), show_data.get('name'), show_data.get('summary')) youtube-dl/youtube_dl/extractor/kankan.py0000644000000000000000000000331212660177411017654 0ustar rootrootfrom __future__ import unicode_literals import re import hashlib from .common import InfoExtractor _md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class KankanIE(InfoExtractor): _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P\d+)\.shtml' _TEST = { 'url': 'http://yinyue.kankan.com/vod/48/48863.shtml', 'md5': '29aca1e47ae68fc28804aca89f29507e', 'info_dict': { 'id': '48863', 'ext': 'flv', 'title': 'Ready To Go', }, 'skip': 'Only available from China', } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, 'video title') surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0) gcids = re.findall(r'http://.+?/.+?/(.+?)/', surls) gcid = gcids[-1] info_url = 'http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid video_info_page = self._download_webpage( info_url, video_id, 'Downloading video url info') ip = self._search_regex(r'ip:"(.+?)"', video_info_page, 'video url ip') path = self._search_regex(r'path:"(.+?)"', video_info_page, 'video url path') param1 = self._search_regex(r'param1:(\d+)', video_info_page, 'param1') param2 = self._search_regex(r'param2:(\d+)', video_info_page, 'param2') key = _md5('xl_mp43651' + param1 + param2) video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2) return { 'id': video_id, 'title': title, 'url': video_url, } youtube-dl/youtube_dl/extractor/arte.py0000644000000000000000000003271412662564617017366 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, ) from ..utils import ( find_xpath_attr, unified_strdate, get_element_by_attribute, int_or_none, NO_DEFAULT, qualities, ) # There are different sources of video in arte.tv, the extraction process # is different for each one. The videos usually expire in 7 days, so we can't # add tests. 
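# Illustrative sketch, not part of the original file: ArteTvIE below splits
# each RTMP URL at 'mp4:' because rtmpdump needs the connection URL and the
# playpath as two separate values. A minimal standalone version of that
# split (the function name is ours, for illustration only):
def _example_split_rtmp(rtmp_url):
    # 'rtmp://host/app/mp4:videos/clip.flv' ->
    # ('rtmp://host/app/', 'mp4:videos/clip.flv')
    base, sep, path = rtmp_url.partition('mp4:')
    return base, sep + path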
class ArteTvIE(InfoExtractor): _VALID_URL = r'http://videos\.arte\.tv/(?Pfr|de|en|es)/.*-(?P.*?)\.html' IE_NAME = 'arte.tv' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) lang = mobj.group('lang') video_id = mobj.group('id') ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml') ref_xml_doc = self._download_xml( ref_xml_url, video_id, note='Downloading metadata') config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang) config_xml_url = config_node.attrib['ref'] config = self._download_xml( config_xml_url, video_id, note='Downloading configuration') formats = [{ 'format_id': q.attrib['quality'], # The playpath starts at 'mp4:', if we don't manually # split the url, rtmpdump will incorrectly parse them 'url': q.text.split('mp4:', 1)[0], 'play_path': 'mp4:' + q.text.split('mp4:', 1)[1], 'ext': 'flv', 'quality': 2 if q.attrib['quality'] == 'hd' else 1, } for q in config.findall('./urls/url')] self._sort_formats(formats) title = config.find('.//name').text thumbnail = config.find('.//firstThumbnailUrl').text return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'formats': formats, } class ArteTVPlus7IE(InfoExtractor): IE_NAME = 'arte.tv:+7' _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P[^/]+)/(?P[^/?#&+])' @classmethod def _extract_url_info(cls, url): mobj = re.match(cls._VALID_URL, url) lang = mobj.group('lang') query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) if 'vid' in query: video_id = query['vid'][0] else: # This is not a real id, it can be for example AJT for the news # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal video_id = mobj.group('id') return video_id, lang def _real_extract(self, url): video_id, lang = self._extract_url_info(url) webpage = self._download_webpage(url, video_id) return self._extract_from_webpage(webpage, video_id, lang) def _extract_from_webpage(self, webpage, video_id, lang): patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') ids = (video_id, '') # some pages contain multiple videos (like # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), # so we first try to look for json URLs that contain the video id from # the 'vid' parameter. patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] json_url = self._html_search_regex( patterns, webpage, 'json vp url', default=None) if not json_url: def find_iframe_url(webpage, default=NO_DEFAULT): return self._html_search_regex( r']+src=(["\'])(?P.+\bjson_url=.+?)\1', webpage, 'iframe url', group='url', default=default) iframe_url = find_iframe_url(webpage, None) if not iframe_url: embed_url = self._html_search_regex( r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) if embed_url: player = self._download_json( embed_url, video_id, 'Downloading player page') iframe_url = find_iframe_url(player['html']) # en and es URLs produce react-based pages with different layout (e.g. 
# http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) if not iframe_url: program = self._search_regex( r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', webpage, 'program', default=None) if program: embed_html = self._parse_json(program, video_id) if embed_html: iframe_url = find_iframe_url(embed_html['embed_html']) if iframe_url: json_url = compat_parse_qs( compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] if json_url: return self._extract_from_json_url(json_url, video_id, lang) # Differend kind of embed URL (e.g. # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) embed_url = self._search_regex( r']+src=(["\'])(?P.+?)\1', webpage, 'embed url', group='url') return self.url_result(embed_url) def _extract_from_json_url(self, json_url, video_id, lang): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] upload_date_str = player_info.get('shootingDate') if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] title = player_info['VTI'].strip() subtitle = player_info.get('VSU', '').strip() if subtitle: title += ' - %s' % subtitle info_dict = { 'id': player_info['VID'], 'title': title, 'description': player_info.get('VDE'), 'upload_date': unified_strdate(upload_date_str), 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ']) LANGS = { 'fr': 'F', 'de': 'A', 'en': 'E[ANG]', 'es': 'E[ESP]', } formats = [] for format_id, format_dict in player_info['VSR'].items(): f = dict(format_dict) versionCode = f.get('versionCode') langcode = LANGS.get(lang, lang) lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] lang_pref = None if versionCode: matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) source_pref = 0 if versionCode is not None: # The original version with subtitles has lower relevance if re.match(r'VO-ST(F|A|E)', versionCode): source_pref -= 10 # The version with sourds/mal subtitles has also lower relevance elif re.match(r'VO?(F|A|E)-STM\1', versionCode): source_pref -= 9 format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, 'language_preference': lang_pref, 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), 'quality': qfunc(f.get('quality')), 'source_preference': source_pref, } if f.get('mediaType') == 'rtmp': format['url'] = f['streamer'] format['play_path'] = 'mp4:' + f['url'] format['ext'] = 'flv' else: format['url'] = f['url'] formats.append(format) self._check_formats(formats, video_id) self._sort_formats(formats) info_dict['formats'] = formats return info_dict # It also uses the arte_vp_url url from the webpage to extract the information class ArteTVCreativeIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:creative' _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de|en|es)/(?:magazine?/)?(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', 'info_dict': { 'id': '72176', 'ext': 'mp4', 'title': 'Folge 2 - Corporate Design', 'upload_date': '20131004', }, }, { 'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion', 'info_dict': { 'id': '160676', 'ext': 'mp4', 'title': 'Monty Python live (mostly)', 'description': 'Événement ! 
Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n', 'upload_date': '20140805', } }] class ArteTVFutureIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:future' _VALID_URL = r'https?://future\.arte\.tv/(?Pfr|de|en|es)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://future.arte.tv/fr/info-sciences/les-ecrevisses-aussi-sont-anxieuses', 'info_dict': { 'id': '050940-028-A', 'ext': 'mp4', 'title': 'Les écrevisses aussi peuvent être anxieuses', 'upload_date': '20140902', }, }, { 'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable', 'only_matching': True, }] class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:ddc' _VALID_URL = r'https?://ddc\.arte\.tv/(?Pemission|folge)/(?P[^/?#&]+)' def _real_extract(self, url): video_id, lang = self._extract_url_info(url) if lang == 'folge': lang = 'de' elif lang == 'emission': lang = 'fr' webpage = self._download_webpage(url, video_id) scriptElement = get_element_by_attribute('class', 'visu_video_block', webpage) script_url = self._html_search_regex(r'src="(.*?)"', scriptElement, 'script url') javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator') json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url') return self._extract_from_json_url(json_url, video_id, lang) class ArteTVConcertIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:concert' _VALID_URL = r'https?://concert\.arte\.tv/(?Pfr|de|en|es)/(?P[^/?#&]+)' _TEST = { 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', 'md5': '9ea035b7bd69696b67aa2ccaaa218161', 'info_dict': { 'id': '186', 'ext': 'mp4', 'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"', 'upload_date': '20140128', 'description': 'md5:486eb08f991552ade77439fe6d82c305', }, } class ArteTVCinemaIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:cinema' _VALID_URL = r'https?://cinema\.arte\.tv/(?Pfr|de|en|es)/(?P.+)' _TEST = { 'url': 'http://cinema.arte.tv/de/node/38291', 'md5': '6b275511a5107c60bacbeeda368c3aa1', 'info_dict': { 'id': '055876-000_PWA12025-D', 'ext': 'mp4', 'title': 'Tod auf dem Nil', 'upload_date': '20160122', 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', }, } class ArteTVMagazineIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:magazine' _VALID_URL = r'https?://(?:www\.)?arte\.tv/magazine/[^/]+/(?Pfr|de|en|es)/(?P[^/?#&]+)' _TESTS = [{ # Embedded via ', webpage) if mobj: embedded_url = mobj.group(1) return self.url_result(embedded_url) video_title = self._html_search_regex( r'
<h1[^>
    ]*>([^<]+)', webpage, 'title') flashvars = self._parse_json(self._search_regex( r'var\s+flashvars\s*=\s*([^;]+);', webpage, 'flashvars'), video_id) formats = [] for height in (180, 240, 480): if flashvars.get('quality_%dp' % height): video_url = flashvars['quality_%dp' % height] a_format = { 'url': video_url, 'height': height, 'format_id': '%dp' % height, } filename_parts = url_basename(video_url).split('_') if len(filename_parts) >= 2 and re.match(r'\d+[Kk]', filename_parts[1]): a_format['tbr'] = int(filename_parts[1][:-1]) formats.append(a_format) age_limit = self._rta_search(webpage) return { 'id': video_id, 'title': video_title, 'formats': formats, 'age_limit': age_limit, 'thumbnail': flashvars.get('image_url') } youtube-dl/youtube_dl/extractor/ard.py0000644000000000000000000003036112641030331017147 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, get_element_by_attribute, qualities, int_or_none, parse_duration, unified_strdate, xpath_text, ) from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', 'info_dict': { 'id': '29582122', 'ext': 'mp4', 'title': 'Ich liebe das Leben trotzdem', 'description': 'md5:45e4c225c72b27993314b31a84a5261c', 'duration': 4557, }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', 'info_dict': { 'id': '29522730', 'ext': 'mp4', 'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. 
ab 20 Uhr)', 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', 'duration': 5252, }, }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', 'md5': '219d94d8980b4f538c7fcb0865eb7f2c', 'info_dict': { 'id': '28488308', 'ext': 'mp3', 'title': 'Tod eines Fußballers', 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', 'duration': 3240, }, }, { 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', 'only_matching': True, }] def _extract_media_info(self, media_info_url, webpage, video_id): media_info = self._download_json( media_info_url, video_id, 'Downloading media JSON') formats = self._extract_formats(media_info, video_id) if not formats: if '"fsk"' in webpage: raise ExtractorError( 'This video is only available after 20:00', expected=True) elif media_info.get('_geoblocked'): raise ExtractorError('This video is not available due to geo restriction', expected=True) self._sort_formats(formats) duration = int_or_none(media_info.get('_duration')) thumbnail = media_info.get('_previewImage') subtitles = {} subtitle_url = media_info.get('_subtitleUrl') if subtitle_url: subtitles['de'] = [{ 'ext': 'srt', 'url': subtitle_url, }] return { 'id': video_id, 'duration': duration, 'thumbnail': thumbnail, 'formats': formats, 'subtitles': subtitles, } def _extract_formats(self, media_info, video_id): type_ = media_info.get('_type') media_array = media_info.get('_mediaArray', []) formats = [] for num, media in enumerate(media_array): for stream in media.get('_mediaStreamArray', []): stream_urls = stream.get('_stream') if not stream_urls: continue if not isinstance(stream_urls, list): stream_urls = [stream_urls] quality = stream.get('_quality') server = stream.get('_server') for stream_url in stream_urls: ext = determine_ext(stream_url) if quality != 'auto' and ext in ('f4m', 'm3u8'): continue if ext == 'f4m': formats.extend(self._extract_f4m_formats( stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, preference=-1, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False)) else: if server and server.startswith('rtmp'): f = { 'url': server, 'play_path': stream_url, 'format_id': 'a%s-rtmp-%s' % (num, quality), } elif stream_url.startswith('http'): f = { 'url': stream_url, 'format_id': 'a%s-%s-%s' % (num, ext, quality) } else: continue m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) if m: f.update({ 'width': int(m.group('width')), 'height': int(m.group('height')), }) if type_ == 'audio': f['vcodec'] = 'none' formats.append(f) return formats def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) numid = re.search(r'documentId=([0-9]+)', url) if numid: video_id = numid.group(1) else: video_id = m.group('video_id') webpage = self._download_webpage(url, video_id) if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage: raise ExtractorError('Video %s is no longer available' % video_id, expected=True) if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage: raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' 
% video_id, expected=True) if re.search(r'[\?&]rss($|[=&])', url): doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) title = self._html_search_regex( [r'(.*?)

</h1>', r'<meta name="dcterms.title" content="(.*?)"/>', r'<h4 class="headline">(.*?)</h4>
    '], webpage, 'title') description = self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( 'description', webpage, 'meta description') # Thumbnail is sometimes not present. # It is in the mobile version, but that seems to use a different URL # structure altogether. thumbnail = self._og_search_thumbnail(webpage, default=None) media_streams = re.findall(r'''(?x) mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s* "([^"]+)"''', webpage) if media_streams: QUALITIES = qualities(['lo', 'hi', 'hq']) formats = [] for furl in set(media_streams): if furl.endswith('.f4m'): fid = 'f4m' else: fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl) fid = fid_m.group(1) if fid_m else None formats.append({ 'quality': QUALITIES(fid), 'format_id': fid, 'url': furl, }) self._sort_formats(formats) info = { 'formats': formats, } else: # request JSON file info = self._extract_media_info( 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) info.update({ 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, }) return info class ARDIE(InfoExtractor): _VALID_URL = '(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' _TEST = { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'md5': 'd216c3a86493f9322545e045ddc3eb35', 'info_dict': { 'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge', 'id': '100', 'ext': 'mp4', 'duration': 2600, 'title': 'Die Story im Ersten: Mission unter falscher Flagge', 'upload_date': '20140804', 'thumbnail': 're:^https?://.*\.jpg$', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('display_id') player_url = mobj.group('mainurl') + '~playerXml.xml' doc = self._download_xml(player_url, display_id) video_node = doc.find('./video') upload_date = unified_strdate(xpath_text( video_node, './broadcastDate')) thumbnail = xpath_text(video_node, './/teaserImage//variant/url') formats = [] for a in video_node.findall('.//asset'): f = { 'format_id': a.attrib['type'], 'width': int_or_none(a.find('./frameWidth').text), 'height': int_or_none(a.find('./frameHeight').text), 'vbr': int_or_none(a.find('./bitrateVideo').text), 'abr': int_or_none(a.find('./bitrateAudio').text), 'vcodec': a.find('./codecVideo').text, 'tbr': int_or_none(a.find('./totalBitrate').text), } if a.find('./serverPrefix').text: f['url'] = a.find('./serverPrefix').text f['playpath'] = a.find('./fileName').text else: f['url'] = a.find('./fileName').text formats.append(f) self._sort_formats(formats) return { 'id': mobj.group('id'), 'formats': formats, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), 'upload_date': upload_date, 'thumbnail': thumbnail, } class SportschauIE(ARDMediathekIE): IE_NAME = 'Sportschau' _VALID_URL = r'(?Phttps?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P[^/#?]+))\.html' _TESTS = [{ 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', 'info_dict': { 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', 'ext': 'mp4', 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den 
Italiener Luca Paolini ab.', }, 'params': { # m3u8 download 'skip_download': True, }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') base_url = mobj.group('baseurl') webpage = self._download_webpage(url, video_id) title = get_element_by_attribute('class', 'headline', webpage) description = self._html_search_meta('description', webpage, 'description') info = self._extract_media_info( base_url + '-mc_defaultQuality-h.json', webpage, video_id) info.update({ 'title': title, 'description': description, }) return info youtube-dl/youtube_dl/extractor/videofyme.py0000644000000000000000000000332612641030331020371 0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, ) class VideofyMeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.videofy\.me/.+?|p\.videofy\.me/v)/(?P\d+)(&|#|$)' IE_NAME = 'videofy.me' _TEST = { 'url': 'http://www.videofy.me/thisisvideofyme/1100701', 'md5': 'c77d700bdc16ae2e9f3c26019bd96143', 'info_dict': { 'id': '1100701', 'ext': 'mp4', 'title': 'This is VideofyMe', 'description': '', 'upload_date': '20130326', 'timestamp': 1364288959, 'uploader': 'VideofyMe', 'uploader_id': 'thisisvideofyme', 'view_count': int, 'likes': int, 'comment_count': int, }, } def _real_extract(self, url): video_id = self._match_id(url) config = self._download_json('http://vf-player-info-loader.herokuapp.com/%s.json' % video_id, video_id)['videoinfo'] video = config.get('video') blog = config.get('blog', {}) return { 'id': video_id, 'title': video['title'], 'url': video['sources']['source']['url'], 'thumbnail': video.get('thumb'), 'description': video.get('description'), 'timestamp': parse_iso8601(video.get('date')), 'uploader': blog.get('name'), 'uploader_id': blog.get('identifier'), 'view_count': int_or_none(self._search_regex(r'([0-9]+)', video.get('views'), 'view count', fatal=False)), 'likes': int_or_none(video.get('likes')), 'comment_count': int_or_none(video.get('nrOfComments')), } youtube-dl/youtube_dl/extractor/aparat.py0000644000000000000000000000371212641030331017651 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, HEADRequest, ) class AparatIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' _TEST = { 'url': 'http://www.aparat.com/v/wP8On', 'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1', 'info_dict': { 'id': 'wP8On', 'ext': 'mp4', 'title': 'تیم گلکسی 11 - زومیت', 'age_limit': 0, }, # 'skip': 'Extremely unreliable', } def _real_extract(self, url): video_id = self._match_id(url) # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work embed_url = ('http://www.aparat.com/video/video/embed/videohash/' + video_id + '/vt/frame') webpage = self._download_webpage(embed_url, video_id) video_urls = [video_url.replace('\\/', '/') for video_url in re.findall( r'(?:fileList\[[0-9]+\]\s*=|"file"\s*:)\s*"([^"]+)"', webpage)] for i, video_url in enumerate(video_urls): req = HEADRequest(video_url) res = self._request_webpage( req, video_id, note='Testing video URL %d' % i, errnote=False) if res: break else: raise ExtractorError('No working video URLs found') title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title') thumbnail = self._search_regex( r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) 
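        # Added explanatory comment (not in the original source): video_url at
        # this point is whichever candidate survived the HEAD-request probe loop
        # above; the for/else construct raises ExtractorError only when the loop
        # finishes without a break, i.e. when every candidate URL failed.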
return { 'id': video_id, 'title': title, 'url': video_url, 'ext': 'mp4', 'thumbnail': thumbnail, 'age_limit': self._family_friendly_search(webpage), } youtube-dl/youtube_dl/extractor/gamersyde.py0000644000000000000000000000423112641030331020356 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( js_to_json, parse_duration, remove_start, ) class GamersydeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_(?P[\da-z_]+)-(?P\d+)_[a-z]{2}\.html' _TEST = { 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html', 'md5': 'f38d400d32f19724570040d5ce3a505f', 'info_dict': { 'id': '34371', 'ext': 'mp4', 'duration': 372, 'title': 'Bloodborne - Birth of a hero', 'thumbnail': 're:^https?://.*\.jpg$', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) playlist = self._parse_json( self._search_regex( r'(?s)playlist: \[({.+?})\]\s*}\);', webpage, 'files'), display_id, transform_source=js_to_json) formats = [] for source in playlist['sources']: video_url = source.get('file') if not video_url: continue format_id = source.get('label') f = { 'url': video_url, 'format_id': format_id, } m = re.search(r'^(?P\d+)[pP](?P\d+)fps', format_id) if m: f.update({ 'height': int(m.group('height')), 'fps': int(m.group('fps')), }) formats.append(f) self._sort_formats(formats) title = remove_start(playlist['title'], '%s - ' % video_id) thumbnail = playlist.get('image') duration = parse_duration(self._search_regex( r'Length:([^<]+)<', webpage, 'duration', fatal=False)) return { 'id': video_id, 'display_id': display_id, 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, } youtube-dl/youtube_dl/extractor/condenast.py0000644000000000000000000001414312641030331020357 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( orderedSet, remove_end, ) class CondeNastIE(InfoExtractor): """ Condé Nast is a media group, some of its sites use a custom HTML5 player that works the same in all of them. """ # The keys are the supported sites and the values are the name to be shown # to the user and in the extractor description. _SITES = { 'allure': 'Allure', 'architecturaldigest': 'Architectural Digest', 'arstechnica': 'Ars Technica', 'bonappetit': 'Bon Appétit', 'brides': 'Brides', 'cnevids': 'Condé Nast', 'cntraveler': 'Condé Nast Traveler', 'details': 'Details', 'epicurious': 'Epicurious', 'glamour': 'Glamour', 'golfdigest': 'Golf Digest', 'gq': 'GQ', 'newyorker': 'The New Yorker', 'self': 'SELF', 'teenvogue': 'Teen Vogue', 'vanityfair': 'Vanity Fair', 'vogue': 'Vogue', 'wired': 'WIRED', 'wmagazine': 'W Magazine', } _VALID_URL = r'http://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) EMBED_URL = r'(?:https?:)?//player\.(?P%s)\.com/(?Pembed(?:js)?)/.+?' 
% '|'.join(_SITES.keys()) _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', 'md5': '1921f713ed48aabd715691f774c451f7', 'info_dict': { 'id': '5171b343c2b4c00dd0c1ccb3', 'ext': 'mp4', 'title': '3D Printed Speakers Lit With LED', 'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', } }, { # JS embed 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js', 'md5': 'f1a6f9cafb7083bab74a710f65d08999', 'info_dict': { 'id': '55f9cf8b61646d1acf00000c', 'ext': 'mp4', 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', } }] def _extract_series(self, url, webpage): title = self._html_search_regex(r'
<div class="cne-series-info">.*?<h1>(.+?)</h1>
    ', webpage, 'series title', flags=re.DOTALL) url_object = compat_urllib_parse_urlparse(url) base_url = '%s://%s' % (url_object.scheme, url_object.netloc) m_paths = re.finditer(r'

<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage, flags=re.DOTALL) paths = orderedSet(m.group(1) for m in m_paths) build_url = lambda path: compat_urlparse.urljoin(base_url, path) entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) def _extract_video(self, webpage, url_type): if url_type != 'embed': description = self._html_search_regex( [ r'<div class="cne-video-description">(.+?)</div>', r'<div class="video-post-content">(.+?)</div>
    ', ], webpage, 'description', fatal=False, flags=re.DOTALL) else: description = None params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, 'player params', flags=re.DOTALL) video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id') target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target') data = compat_urllib_parse.urlencode({'videoId': video_id, 'playerId': player_id, 'target': target, }) base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]', webpage, 'base info url', default='http://player.cnevids.com/player/loader.js?') info_url = base_info_url + data info_page = self._download_webpage(info_url, video_id, 'Downloading video info') video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info') video_info = self._parse_json(video_info, video_id) formats = [{ 'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']), 'url': fdata['src'], 'ext': fdata['type'].split('/')[-1], 'quality': 1 if fdata['quality'] == 'high' else 0, } for fdata in video_info['sources'][0]] self._sort_formats(formats) return { 'id': video_id, 'formats': formats, 'title': video_info['title'], 'thumbnail': video_info['poster_frame'], 'description': description, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) site = mobj.group('site') url_type = mobj.group('type') item_id = mobj.group('id') # Convert JS embed to regular embed if url_type == 'embedjs': parsed_url = compat_urlparse.urlparse(url) url = compat_urlparse.urlunparse(parsed_url._replace( path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/'))) url_type = 'embed' self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site]) webpage = self._download_webpage(url, item_id) if url_type == 'series': return self._extract_series(url, webpage) else: return self._extract_video(webpage, url_type) youtube-dl/youtube_dl/extractor/dctp.py0000644000000000000000000000405712641030331017336 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str class DctpTvIE(InfoExtractor): _VALID_URL = r'http://www.dctp.tv/(#/)?filme/(?P.+?)/$' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { 'id': '1324', 'display_id': 'videoinstallation-fuer-eine-kaufhausfassade', 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade' }, 'params': { # rtmp download 'skip_download': True, } } def _real_extract(self, url): video_id = self._match_id(url) base_url = 'http://dctp-ivms2-restapi.s3.amazonaws.com/' version_json = self._download_json( base_url + 'version.json', video_id, note='Determining file version') version = version_json['version_name'] info_json = self._download_json( '{0}{1}/restapi/slugs/{2}.json'.format(base_url, version, video_id), video_id, note='Fetching object ID') object_id = compat_str(info_json['object_id']) meta_json = self._download_json( '{0}{1}/restapi/media/{2}.json'.format(base_url, version, object_id), video_id, note='Downloading metadata') uuid = meta_json['uuid'] title = meta_json['title'] wide = meta_json['is_wide'] if wide: ratio = '16x9' else: ratio = '4x3' play_path = 'mp4:{0}_dctp_0500_{1}.m4v'.format(uuid, ratio) servers_json = self._download_json( 'http://www.dctp.tv/streaming_servers/', video_id, note='Downloading server list') url = servers_json[0]['endpoint'] return { 
'id': object_id, 'title': title, 'format': 'rtmp', 'url': url, 'play_path': play_path, 'rtmp_real_time': True, 'ext': 'flv', 'display_id': video_id } youtube-dl/youtube_dl/extractor/rtl2.py0000644000000000000000000000566112660177411017305 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor class RTL2IE(InfoExtractor): _VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P[^?#/]*?)(?:$|/(?:$|[?#]))' _TESTS = [{ 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', 'info_dict': { 'id': 'folge-203-0', 'ext': 'f4v', 'title': 'GRIP sucht den Sommerkönig', 'description': 'Matthias, Det und Helge treten gegeneinander an.' }, 'params': { # rtmp download 'skip_download': True, }, }, { 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', 'info_dict': { 'id': '21040-anna-erwischt-alex', 'ext': 'mp4', 'title': 'Anna erwischt Alex!', 'description': 'Anna ist Alex\' Tochter bei Köln 50667.' }, 'params': { # rtmp download 'skip_download': True, }, }] def _real_extract(self, url): # Some rtl2 urls have no slash at the end, so append it. if not url.endswith('/'): url += '/' video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) mobj = re.search( r']+data-collection="(?P\d+)"[^>]+data-video="(?P\d+)"', webpage) if mobj: vico_id = mobj.group('vico_id') vivi_id = mobj.group('vivi_id') else: vico_id = self._html_search_regex( r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') vivi_id = self._html_search_regex( r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id info = self._download_json(info_url, video_id) video_info = info['video'] title = video_info['titel'] description = video_info.get('beschreibung') thumbnail = video_info.get('image') download_url = video_info['streamurl'] download_url = download_url.replace('\\', '') stream_url = 'mp4:' + self._html_search_regex(r'ondemand/(.*)', download_url, 'stream URL') rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] formats = [{ 'url': download_url, 'play_path': stream_url, 'player_url': 'http://www.rtl2.de/flashplayer/vipo_player.swf', 'page_url': url, 'flash_version': 'LNX 11,2,202,429', 'rtmp_conn': rtmp_conn, 'no_resume': True, }] self._sort_formats(formats) return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'description': description, 'formats': formats, } youtube-dl/youtube_dl/extractor/lynda.py0000644000000000000000000002150012641030331017503 0ustar rootrootfrom __future__ import unicode_literals import re import json from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, ) from ..utils import ( ExtractorError, clean_html, int_or_none, sanitized_Request, ) class LyndaBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' 
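    # Added explanatory comment (not in the original source): _login() below is
    # a two-step flow. The first POST submits the credentials; a returned
    # loginResultJson state of 'notlogged' means they were rejected, while
    # 'conflicted' means lynda.com reports the account is already signed in on
    # two devices, so a second POST with resolve=true confirms logging the
    # other device out before the page is re-checked for logged-in markers.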
_NETRC_MACHINE = 'lynda' def _real_initialize(self): self._login() def _login(self): username, password = self._get_login_info() if username is None: return login_form = { 'username': username.encode('utf-8'), 'password': password.encode('utf-8'), 'remember': 'false', 'stayPut': 'false' } request = sanitized_Request( self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) login_page = self._download_webpage( request, None, 'Logging in as %s' % username) # Not (yet) logged in m = re.search(r'loginResultJson\s*=\s*\'(?P[^\']+)\';', login_page) if m is not None: response = m.group('json') response_json = json.loads(response) state = response_json['state'] if state == 'notlogged': raise ExtractorError( 'Unable to login, incorrect username and/or password', expected=True) # This is when we get popup: # > You're already logged in to lynda.com on two devices. # > If you log in here, we'll log you out of another device. # So, we need to confirm this. if state == 'conflicted': confirm_form = { 'username': '', 'password': '', 'resolve': 'true', 'remember': 'false', 'stayPut': 'false', } request = sanitized_Request( self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8')) login_page = self._download_webpage( request, None, 'Confirming log in and log out from another device') if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): if 'login error' in login_page: mobj = re.search( r'(?s)]+class="topmost">(?P[^<]+)</h1>\s*<div>(?P<description>.+?)</div>', login_page) if mobj: raise ExtractorError( 'lynda returned error: %s - %s' % (mobj.group('title'), clean_html(mobj.group('description'))), expected=True) raise ExtractorError('Unable to log in') def _logout(self): username, _ = self._get_login_info() if username is None: return self._download_webpage( 'http://www.lynda.com/ajax/logout.aspx', None, 'Logging out', 'Unable to log out', fatal=False) class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(?P<id>\d+)' _NETRC_MACHINE = 'lynda' _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]' _TESTS = [{ 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', 'info_dict': { 'id': '114408', 'ext': 'mp4', 'title': 'Using the exercise files', 'duration': 68 } }, { 'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( 'http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id, 'Downloading video JSON') if 'Status' in video: raise ExtractorError( 'lynda returned error: %s' % video['Message'], expected=True) if video.get('HasAccess') is False: self.raise_login_required('Video %s is only available for members' % video_id) video_id = compat_str(video.get('ID') or video_id) duration = int_or_none(video.get('DurationInSeconds')) title = video['Title'] formats = [] fmts = video.get('Formats') if fmts: formats.extend([{ 'url': f['Url'], 'ext': f.get('Extension'), 'width': int_or_none(f.get('Width')), 'height': int_or_none(f.get('Height')), 'filesize': int_or_none(f.get('FileSize')), 'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None, } for f in fmts if f.get('Url')]) prioritized_streams = video.get('PrioritizedStreams') if prioritized_streams: for prioritized_stream_id, 
prioritized_stream in prioritized_streams.items(): formats.extend([{ 'url': video_url, 'width': int_or_none(format_id), 'format_id': '%s-%s' % (prioritized_stream_id, format_id), } for format_id, video_url in prioritized_stream.items()]) self._check_formats(formats, video_id) self._sort_formats(formats) subtitles = self.extract_subtitles(video_id) return { 'id': video_id, 'title': title, 'duration': duration, 'subtitles': subtitles, 'formats': formats } def _fix_subtitles(self, subs): srt = '' seq_counter = 0 for pos in range(0, len(subs) - 1): seq_current = subs[pos] m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) if m_current is None: continue seq_next = subs[pos + 1] m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) if m_next is None: continue appear_time = m_current.group('timecode') disappear_time = m_next.group('timecode') text = seq_current['Caption'].strip() if text: seq_counter += 1 srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text) if srt: return srt def _get_subtitles(self, video_id): url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id subs = self._download_json(url, None, False) if subs: return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} else: return {} class LyndaCourseIE(LyndaBaseIE): IE_NAME = 'lynda:course' IE_DESC = 'lynda.com online courses' # Course link equals to welcome/introduction video link of same course # We will recognize it as course link _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P<coursepath>[^/]+/[^/]+/(?P<courseid>\d+))-\d\.html' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) course_path = mobj.group('coursepath') course_id = mobj.group('courseid') course = self._download_json( 'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, course_id, 'Downloading course JSON') self._logout() if course.get('Status') == 'NotFound': raise ExtractorError( 'Course %s does not exist' % course_id, expected=True) unaccessible_videos = 0 videos = [] # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided # by single video API anymore for chapter in course['Chapters']: for video in chapter.get('Videos', []): if video.get('HasAccess') is False: unaccessible_videos += 1 continue if video.get('ID'): videos.append(video['ID']) if unaccessible_videos > 0: self._downloader.report_warning( '%s videos are only available for members (or paid members) and will not be downloaded. 
' % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT) entries = [ self.url_result( 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id), 'Lynda') for video_id in videos] course_title = course.get('Title') return self.playlist_result(entries, course_id, course_title) ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/ellentv.py����������������������������������������������������������0000644�0000000�0000000�00000005224�12660177411�020066� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import json from .common import InfoExtractor from ..utils import ( ExtractorError, ) class EllenTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' _TEST = { 'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', 'md5': '4294cf98bc165f218aaa0b89e0fd8042', 'info_dict': { 'id': '0_ipq1gsai', 'ext': 'mov', 'title': 'Fast Fingers of Fate', 'description': 'md5:3539013ddcbfa64b2a6d1b38d910868a', 'timestamp': 1428035648, 'upload_date': '20150403', 'uploader_id': 'batchUser', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://widgets.ellentube.com/videos/%s' % video_id, video_id) partner_id = self._search_regex( r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id') kaltura_id = self._search_regex( [r'id="kaltura_player_([^"]+)"', r"_wb_entry_id\s*:\s*'([^']+)", r'data-kaltura-entry-id="([^"]+)'], webpage, 'kaltura id') return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura') class EllenTVClipsIE(InfoExtractor): IE_NAME = 'EllenTV:clips' _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P<id>[a-z0-9_-]+)' _TEST = { 'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/', 'info_dict': { 'id': 'meryl-streep-vanessa-hudgens', 'title': 'Meryl Streep, Vanessa Hudgens', }, 'playlist_mincount': 7, } def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) playlist = self._extract_playlist(webpage) return { '_type': 'playlist', 'id': playlist_id, 'title': self._og_search_title(webpage), 'entries': self._extract_entries(playlist) } def _extract_playlist(self, webpage): json_string = self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json') try: return json.loads('[{' + json_string + '}]') except ValueError as ve: raise ExtractorError('Failed to download JSON', cause=ve) def _extract_entries(self, playlist): return [ self.url_result( 'kaltura:%s:%s' % (item['kaltura_partner_id'], item['kaltura_entry_id']), 'Kaltura') for item in playlist] 
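# Illustrative sketch, not part of the original file: _extract_playlist above
# captures only the text between the outer '[{' and '}]' of the
# playerView.addClips([...]) call, so the braces must be re-added before
# json.loads. A self-contained reproduction with a made-up page snippet:
if __name__ == '__main__':
    import json
    import re
    page = 'playerView.addClips([{"kaltura_entry_id": "1_abc"}]);'
    inner = re.search(r'playerView\.addClips\(\[\{(.*?)\}\]\);', page).group(1)
    clips = json.loads('[{' + inner + '}]')
    assert clips[0]['kaltura_entry_id'] == '1_abc'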
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/redtube.py����������������������������������������������������������0000644�0000000�0000000�00000002656�12641030331�020041� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ExtractorError class RedTubeIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.redtube.com/66418', 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', 'info_dict': { 'id': '66418', 'ext': 'mp4', 'title': 'Sucked on a toilet', 'age_limit': 18, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): raise ExtractorError('Video %s has been removed' % video_id, expected=True) video_url = self._html_search_regex( r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') video_title = self._html_search_regex( r'<h1 class="videoTitle[^"]*">(.+?)</h1>', webpage, 'title') video_thumbnail = self._og_search_thumbnail(webpage) # No self-labeling, but they describe themselves as # "Home of Videos Porno" age_limit = 18 return { 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title, 'thumbnail': video_thumbnail, 'age_limit': age_limit, } ����������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/naver.py������������������������������������������������������������0000644�0000000�0000000�00000007254�12641030331�017521� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urlparse, ) from ..utils import ( ExtractorError, ) class NaverIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' _TESTS = [{ 'url': 'http://tvcast.naver.com/v/81652', 'info_dict': { 'id': '81652', 'ext': 'mp4', 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', 'upload_date': '20130903', }, }, { 'url': 'http://tvcast.naver.com/v/395837', 'md5': '638ed4c12012c458fefcddfd01f173cd', 'info_dict': { 'id': '395837', 'ext': 'mp4', 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7', 'upload_date': '20150519', }, 'skip': 'Georestricted', }] def _real_extract(self, url): video_id = self._match_id(url) webpage = 
self._download_webpage(url, video_id) m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"', webpage) if m_id is None: error = self._html_search_regex( r'(?s)<div class="(?:nation_error|nation_box|error_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>', webpage, 'error', default=None) if error: raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') vid = m_id.group(1) key = m_id.group(2) query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key, }) query_urls = compat_urllib_parse.urlencode({ 'masterVid': vid, 'protocol': 'p2p', 'inKey': key, }) info = self._download_xml( 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, video_id, 'Downloading video info') urls = self._download_xml( 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls, video_id, 'Downloading video formats info') formats = [] for format_el in urls.findall('EncodingOptions/EncodingOption'): domain = format_el.find('Domain').text uri = format_el.find('uri').text f = { 'url': compat_urlparse.urljoin(domain, uri), 'ext': 'mp4', 'width': int(format_el.find('width').text), 'height': int(format_el.find('height').text), } if domain.startswith('rtmp'): # urlparse does not support custom schemes # https://bugs.python.org/issue18828 f.update({ 'url': domain + uri, 'ext': 'flv', 'rtmp_protocol': '1', # rtmpt }) formats.append(f) self._sort_formats(formats) return { 'id': video_id, 'title': info.find('Subject').text, 'formats': formats, 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'upload_date': info.find('WriteDate').text.replace('.', ''), 'view_count': int(info.find('PlayCount').text), } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/franceculture.py����������������������������������������������������0000644�0000000�0000000�00000010144�12641030331�021240� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_urlparse, ) from ..utils import ( determine_ext, int_or_none, ExtractorError, ) class FranceCultureIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/player/reecouter\?play=(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.franceculture.fr/player/reecouter?play=4795174', 'info_dict': { 'id': '4795174', 'ext': 'mp3', 'title': 'Rendez-vous au pays des geeks', 'alt_title': 'Carnet nomade | 13-14', 'vcodec': 'none', 'upload_date': '20140301', 'thumbnail': r're:^http://static\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$', 'description': 'startswith:Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche', 'timestamp': 1393700400, } } def _extract_from_player(self, url, video_id): webpage = self._download_webpage(url, video_id) video_path = self._search_regex( 
        video_url = compat_urlparse.urljoin(url, video_path)
        timestamp = int_or_none(self._search_regex(
            r'<a id="player".*?data-date="([0-9]+)"',
            webpage, 'upload date', fatal=False))
        thumbnail = self._search_regex(
            r'<a id="player".*?>\s+<img src="([^"]+)"',
            webpage, 'thumbnail', fatal=False)

        display_id = self._search_regex(
            r'<span class="path-diffusion">emission-(.*?)</span>', webpage, 'display_id')

        title = self._html_search_regex(
            r'<span class="title-diffusion">(.*?)</span>', webpage, 'title')
        alt_title = self._html_search_regex(
            r'<span class="title">(.*?)</span>', webpage,
            'alt_title', fatal=False)
        description = self._html_search_regex(
            r'<span class="description">(.*?)</span>', webpage,
            'description', fatal=False)

        uploader = self._html_search_regex(
            r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
            webpage, 'uploader', default=None)
        vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None

        return {
            'id': video_id,
            'url': video_url,
            'vcodec': vcodec,
            'uploader': uploader,
            'timestamp': timestamp,
            'title': title,
            'alt_title': alt_title,
            'thumbnail': thumbnail,
            'description': description,
            'display_id': display_id,
        }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        return self._extract_from_player(url, video_id)


class FranceCultureEmissionIE(FranceCultureIE):
    _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emission-(?P<id>[^?#]+)'
    _TEST = {
        'url': 'http://www.franceculture.fr/emission-les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13',
        'info_dict': {
            'title': 'Jean-Gabriel Périot, cinéaste',
            'alt_title': 'Les Carnets de la création',
            'id': '5093239',
            'display_id': 'les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13',
            'ext': 'mp3',
            'timestamp': 1444762500,
            'upload_date': '20151013',
            'description': 'startswith:Aujourd\'hui dans "Les carnets de la création", le cinéaste',
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        video_path = self._html_search_regex(
            r'<a class="rf-player-open".*?href="([^"]+)"', webpage,
            'video path', 'no_path_player')
        if video_path == 'no_path_player':
            raise ExtractorError('no player : no sound in this page.', expected=True)
        new_id = self._search_regex(
            'play=(?P<id>[0-9]+)', video_path, 'new_id', group='id')
        video_url = compat_urlparse.urljoin(url, video_path)
        return self._extract_from_player(video_url, new_id)
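
# Both classes above funnel into _extract_from_player(), so the emission and
# player URL shapes share one extraction path. A minimal usage sketch through
# the public YoutubeDL front end (metadata only; the URL is the test URL from
# above, and this block is illustrative rather than part of the module):

if __name__ == '__main__':
    import youtube_dl

    with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
        # download=False returns the extracted info dict without fetching media
        info = ydl.extract_info(
            'http://www.franceculture.fr/player/reecouter?play=4795174',
            download=False)
        print(info['id'], info.get('title'))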
youtube-dl/youtube_dl/extractor/cbc.py
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import js_to_json


class CBCIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:[^/]+/)+(?P<id>[^/?#]+)'
    _TESTS = [{
        # with mediaId
        'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
        'info_dict': {
            'id': '2682904050',
            'ext': 'flv',
            'title': 'Don Cherry – All-Stars',
            'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.',
            'timestamp': 1454475540,
            'upload_date': '20160203',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }, {
        # with clipId
        'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
        'info_dict': {
            'id': '2487345465',
            'ext': 'flv',
            'title': 'Robin Williams freestyles on 90 Minutes Live',
            'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
            'upload_date': '19700101',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }, {
        # multiple iframes
        'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
        'playlist': [{
            'info_dict': {
                'id': '2680832926',
                'ext': 'flv',
                'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
                'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
                'upload_date': '19700101',
            },
        }, {
            'info_dict': {
                'id': '2658915080',
                'ext': 'flv',
                'title': 'Fly like an eagle!',
                'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
                'upload_date': '19700101',
            },
        }],
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }]

    @classmethod
    def suitable(cls, url):
        return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url)

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        player_init = self._search_regex(
            r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage, 'player init',
            default=None)
        if player_init:
            player_info = self._parse_json(player_init, display_id, js_to_json)
            media_id = player_info.get('mediaId')
            if not media_id:
                clip_id = player_info['clipId']
                media_id = self._download_json(
                    'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id,
                    clip_id)['entries'][0]['id'].split('/')[-1]
            return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
        else:
            entries = [
                self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
                for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)]
            return self.playlist_result(entries)


class CBCPlayerIE(InfoExtractor):
    _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.cbc.ca/player/play/2683190193',
        'info_dict': {
            'id': '2683190193',
            'ext': 'flv',
            'title': 'Gerry Runs a Sweat Shop',
            'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0',
            'timestamp': 1455067800,
            'upload_date': '20160210',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        return self.url_result(
            'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id,
            'ThePlatformFeed', video_id)
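
# How the clipId -> mediaId hop in CBCIE above works: thePlatform's feed API
# is asked for the release matching the clip id, and the numeric media id is
# the last path segment of the returned entry's "id" URN. A standalone
# Python 3 sketch of that lookup (endpoint and field layout taken from the
# extractor above; the response shape is assumed, not guaranteed -
# youtube-dl itself goes through _download_json):
import json
import urllib.request


def media_id_for_clip(clip_id):
    url = ('http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F'
           '?fields=id&byContent=byReleases%3DbyId%253D' + clip_id)
    with urllib.request.urlopen(url) as resp:
        entries = json.loads(resp.read().decode('utf-8'))['entries']
    # e.g. an "id" URN whose trailing segment is the numeric media id
    return entries[0]['id'].split('/')[-1]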
youtube-dl/youtube_dl/extractor/dreisat.py
from __future__ import unicode_literals

import re

from .zdf import ZDFIE


class DreiSatIE(ZDFIE):
    IE_NAME = '3sat'
    _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
    _TESTS = [
        {
            'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
            'md5': 'be37228896d30a88f315b638900a026e',
            'info_dict': {
                'id': '45918',
                'ext': 'mp4',
                'title': 'Waidmannsheil',
                'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
                'uploader': '3sat',
                'upload_date': '20140913'
            }
        },
        {
            'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
        return self.extract_from_xml_url(video_id, details_url)

youtube-dl/youtube_dl/extractor/vessel.py
# coding: utf-8
from __future__ import unicode_literals

import json

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    parse_iso8601,
    sanitized_Request,
)


class VesselIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)'
    _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s'
    _LOGIN_URL = 'https://www.vessel.com/api/account/login'
    _NETRC_MACHINE = 'vessel'
    _TEST = {
        'url': 'https://www.vessel.com/videos/HDN7G5UMs',
        'md5': '455cdf8beb71c6dd797fd2f3818d05c4',
        'info_dict': {
            'id': 'HDN7G5UMs',
            'ext': 'mp4',
            'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?',
            'thumbnail': 're:^https?://.*\.jpg$',
            'upload_date': '20150317',
            'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?',
            'timestamp': int,
        },
    }

    @staticmethod
    def make_json_request(url, data):
        payload = json.dumps(data).encode('utf-8')
        req = sanitized_Request(url, payload)
        req.add_header('Content-Type', 'application/json; charset=utf-8')
        return req

    @staticmethod
    def
find_assets(data, asset_type, asset_id=None): for asset in data.get('assets', []): if not asset.get('type') == asset_type: continue elif asset_id is not None and not asset.get('id') == asset_id: continue else: yield asset def _check_access_rights(self, data): access_info = data.get('__view', {}) if not access_info.get('allow_access', True): err_code = access_info.get('error_code') or '' if err_code == 'ITEM_PAID_ONLY': raise ExtractorError( 'This video requires subscription.', expected=True) else: raise ExtractorError( 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) def _login(self): (username, password) = self._get_login_info() if username is None: return self.report_login() data = { 'client_id': 'web', 'type': 'password', 'user_key': username, 'password': password, } login_request = VesselIE.make_json_request(self._LOGIN_URL, data) self._download_webpage(login_request, None, False, 'Wrong login info') def _real_initialize(self): self._login() def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) data = self._parse_json(self._search_regex( r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id) asset_id = data['model']['data']['id'] req = VesselIE.make_json_request( self._API_URL_TEMPLATE % asset_id, {'client': 'web'}) data = self._download_json(req, video_id) video_asset_id = data.get('main_video_asset') self._check_access_rights(data) try: video_asset = next( VesselIE.find_assets(data, 'video', asset_id=video_asset_id)) except StopIteration: raise ExtractorError('No video assets found') formats = [] for f in video_asset.get('sources', []): if f['name'] == 'hls-index': formats.extend(self._extract_m3u8_formats( f['location'], video_id, ext='mp4', m3u8_id='m3u8')) else: formats.append({ 'format_id': f['name'], 'tbr': f.get('bitrate'), 'height': f.get('height'), 'width': f.get('width'), 'url': f['location'], }) self._sort_formats(formats) thumbnails = [] for im_asset in VesselIE.find_assets(data, 'image'): thumbnails.append({ 'url': im_asset['location'], 'width': im_asset.get('width', 0), 'height': im_asset.get('height', 0), }) return { 'id': video_id, 'title': data['title'], 'formats': formats, 'thumbnails': thumbnails, 'description': data.get('short_description'), 'duration': data.get('duration'), 'comment_count': data.get('comment_count'), 'like_count': data.get('like_count'), 'view_count': data.get('view_count'), 'timestamp': parse_iso8601(data.get('released_at')), } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/dumpert.py����������������������������������������������������������0000644�0000000�0000000�00000004270�12641030331�020061� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import base64 import re from .common import InfoExtractor from ..utils import ( qualities, sanitized_Request, ) 
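
# dumpert.nl ships its format list as a base64-encoded JSON blob in the
# player's data-files attribute; DumpertIE below decodes that blob and ranks
# the entries with qualities(). A self-contained sketch of just the decode
# step (the blob below is fabricated for illustration, not real site data):

if __name__ == '__main__':
    import base64
    import json

    fake_blob = base64.b64encode(json.dumps({
        'mobile': 'http://media.example.com/video_mobile.mp4',
        '720p': 'http://media.example.com/video_720.mp4',
        'still': 'http://media.example.com/thumb.jpg',
    }).encode('utf-8'))

    files = json.loads(base64.b64decode(fake_blob).decode('utf-8'))
    # every entry except the 'still' thumbnail is a playable format
    print(sorted(k for k in files if k != 'still'))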
class DumpertIE(InfoExtractor): _VALID_URL = r'(?P<protocol>https?)://(?:www\.)?dumpert\.nl/(?:mediabase|embed)/(?P<id>[0-9]+/[0-9a-zA-Z]+)' _TESTS = [{ 'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/', 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', 'info_dict': { 'id': '6646981/951bc60f', 'ext': 'mp4', 'title': 'Ik heb nieuws voor je', 'description': 'Niet schrikken hoor', 'thumbnail': 're:^https?://.*\.jpg$', } }, { 'url': 'http://www.dumpert.nl/embed/6675421/dc440fe7/', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') protocol = mobj.group('protocol') url = '%s://www.dumpert.nl/mediabase/%s' % (protocol, video_id) req = sanitized_Request(url) req.add_header('Cookie', 'nsfw=1; cpc=10') webpage = self._download_webpage(req, video_id) files_base64 = self._search_regex( r'data-files="([^"]+)"', webpage, 'data files') files = self._parse_json( base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'), video_id) quality = qualities(['flv', 'mobile', 'tablet', '720p']) formats = [{ 'url': video_url, 'format_id': format_id, 'quality': quality(format_id), } for format_id, video_url in files.items() if format_id != 'still'] self._sort_formats(formats) title = self._html_search_meta( 'title', webpage) or self._og_search_title(webpage) description = self._html_search_meta( 'description', webpage) or self._og_search_description(webpage) thumbnail = files.get('still') or self._og_search_thumbnail(webpage) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'formats': formats } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/podomatic.py��������������������������������������������������������0000644�0000000�0000000�00000004427�12641030331�020364� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..utils import int_or_none class PodomaticIE(InfoExtractor): IE_NAME = 'podomatic' _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' _TESTS = [ { 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', 'md5': '84bb855fcf3429e6bf72460e1eed782d', 'info_dict': { 'id': '2009-01-02T16_03_35-08_00', 'ext': 'mp3', 'uploader': 'Science Teaching Tips', 'uploader_id': 'scienceteachingtips', 'title': '64. 
When the Moon Hits Your Eye', 'duration': 446, } }, { 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', 'md5': 'd2cf443931b6148e27638650e2638297', 'info_dict': { 'id': '2013-11-15T16_31_21-08_00', 'ext': 'mp3', 'uploader': 'Ostbahnhof / Techno Mix', 'uploader_id': 'ostbahnhof', 'title': 'Einunddreizig', 'duration': 3799, } }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') channel = mobj.group('channel') json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + '?permalink=true&rtmp=0') % (mobj.group('proto'), channel, video_id)) data_json = self._download_webpage( json_url, video_id, 'Downloading video info') data = json.loads(data_json) video_url = data['downloadLink'] if not video_url: video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation']) uploader = data['podcast'] title = data['title'] thumbnail = data['imageLocation'] duration = int_or_none(data.get('length'), 1000) return { 'id': video_id, 'url': video_url, 'title': title, 'uploader': uploader, 'uploader_id': channel, 'thumbnail': thumbnail, 'duration': duration, } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/ruhd.py�������������������������������������������������������������0000644�0000000�0000000�00000003066�12641030331�017345� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# -*- coding: utf-8 -*- from __future__ import unicode_literals from .common import InfoExtractor class RUHDIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' _TEST = { 'url': 'http://www.ruhd.ru/play.php?vid=207', 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', 'info_dict': { 'id': '207', 'ext': 'divx', 'title': 'КОТ бааааам', 'description': 'классный кот)', 'thumbnail': 're:^http://.*\.jpg$', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._html_search_regex( r'<param name="src" value="([^"]+)"', webpage, 'video url') title = self._html_search_regex( r'<title>([^<]+)   RUHD.ru - Видео Высокого качества №1 в России!', webpage, 'title') description = self._html_search_regex( r'(?s)
<div id="longdesc">(.+?)<span id="showlink">',
            webpage, 'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<param name="previewImage" value="([^"]+)"',
            webpage, 'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = 'http://www.ruhd.ru' + thumbnail

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }

youtube-dl/youtube_dl/extractor/regiotv.py
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    sanitized_Request,
    xpath_text,
    xpath_with_ns,
)


class RegioTVIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?regio-tv\.de/video/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.regio-tv.de/video/395808.html',
        'info_dict': {
            'id': '395808',
            'ext': 'mp4',
            'title': 'Wir in Ludwigsburg',
            'description': 'Mit unseren zuckersüßen Adventskindern, außerdem besuchen wir die Abendsterne!',
        }
    }, {
        'url': 'http://www.regio-tv.de/video/395808',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        key = self._search_regex(
            r'key\s*:\s*(["\'])(?P<key>.+?)\1', webpage, 'key', group='key')
        title = self._og_search_title(webpage)

        SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>'

        request = sanitized_Request(
            'http://v.telvi.de/',
            SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8'))
        video_data = self._download_xml(request, video_id, 'Downloading video XML')

        NS_MAP = {
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'soap': 'http://schemas.xmlsoap.org/soap/envelope/',
        }

        video_url = xpath_text(
            video_data, xpath_with_ns('.//video', NS_MAP), 'video url', fatal=True)
        thumbnail = xpath_text(
            video_data, xpath_with_ns('.//image', NS_MAP), 'thumbnail')
        description = self._og_search_description(
            webpage) or self._html_search_meta('description', webpage)

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }

youtube-dl/youtube_dl/extractor/jadorecettepub.py
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from .youtube import YoutubeIE


class JadoreCettePubIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html'
    _TEST = {
        'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html',
        'md5': '401286a06067c70b44076044b66515de',
        'info_dict': {
            'id': 'jLMja3tr7a4',
            'ext': 'mp4',
            'title': 'La pire utilisation de Star Wars',
            'description': "Jadorecettepub.com vous a gratifié de plusieurs pubs géniales utilisant Star Wars et Dark Vador plus particulièrement... Mais l'heure est venue de vous proposer une version totalement massacrée, venue du Japon. Quand les Japonais détruisent l'image de Star Wars pour vendre du thon en boite, ça promet...",
        },
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('id')

        webpage = self._download_webpage(url, display_id)

        title = self._html_search_regex(
            r'<span style="font-size: x-large;"><b>(.*?)</b></span>',
            webpage, 'title')
        description = self._html_search_regex(
            r'(?s)
    (.*?)', webpage, 'info json', flags=re.DOTALL)) youtube_id = info.get('youtubeId') if youtube_id: return self.url_result(youtube_id, 'Youtube') formats = [{ 'url': media['uri'] + '?' + info['AuthToken'], 'tbr': media['bitRate'], 'width': media['width'], 'height': media['height'], } for media in info['media'] if media.get('mediaPurpose') == 'play'] if not formats: formats.append({ 'url': info['videoUri'] }) self._sort_formats(formats) duration = int_or_none(info.get('videoLengthInSeconds')) age_limit = parse_age_limit(info.get('audienceRating')) return { 'id': video_id, 'title': info['contentName'], 'thumbnail': info['thumbUri'], 'duration': duration, 'age_limit': age_limit, 'formats': formats, } youtube-dl/youtube_dl/extractor/shahid.py0000644000000000000000000001000412645665720017655 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, parse_iso8601, ) class ShahidIE(InfoExtractor): _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P\d+)/?' _TESTS = [{ 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', 'info_dict': { 'id': '90574', 'ext': 'mp4', 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3', 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان', 'duration': 2972, 'timestamp': 1422057420, 'upload_date': '20150123', }, 'params': { # m3u8 download 'skip_download': True, } }, { # shahid plus subscriber only 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', 'only_matching': True }] def _handle_error(self, response): if not isinstance(response, dict): return error = response.get('error') if error: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), expected=True) def _download_json(self, url, video_id, note='Downloading JSON metadata'): response = super(ShahidIE, self)._download_json(url, video_id, note)['data'] self._handle_error(response) return response def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) api_vars = { 'id': video_id, 'type': 'player', 'url': 'http://api.shahid.net/api/v1_1', 'playerType': 'episode', } flashvars = self._search_regex( r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None) if flashvars: for key in api_vars.keys(): value = self._search_regex( r'\b%s\s*:\s*(?P["\'])(?P.+?)(?P=q)' % key, flashvars, 'type', default=None, group='value') if value: api_vars[key] = value player = self._download_json( 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html' % (video_id, api_vars['type']), video_id, 'Downloading player JSON') if player.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') video = self._download_json( '%s/%s/%s?%s' % ( api_vars['url'], api_vars['playerType'], api_vars['id'], compat_urllib_parse.urlencode({ 'apiKey': 'sh@hid0nlin3', 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', })), video_id, 'Downloading video JSON') video = video[api_vars['playerType']] title = video['title'] description = video.get('description') thumbnail = video.get('thumbnailUrl') duration = 
int_or_none(video.get('duration')) timestamp = parse_iso8601(video.get('referenceDate')) categories = [ category['name'] for category in video.get('genres', []) if 'name' in category] return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'timestamp': timestamp, 'categories': categories, 'formats': formats, } youtube-dl/youtube_dl/extractor/veoh.py0000644000000000000000000001131212641030331017335 0ustar rootrootfrom __future__ import unicode_literals import re import json from .common import InfoExtractor from ..utils import ( int_or_none, ExtractorError, sanitized_Request, ) class VeohIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P(?:v|yapi-)[\da-zA-Z]+)' _TESTS = [ { 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', 'md5': '620e68e6a3cff80086df3348426c9ca3', 'info_dict': { 'id': '56314296', 'ext': 'mp4', 'title': 'Straight Backs Are Stronger', 'uploader': 'LUMOback', 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ', }, }, { 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', 'info_dict': { 'id': '27701988', 'ext': 'mp4', 'title': 'Chile workers cover up to avoid skin damage', 'description': 'md5:2bd151625a60a32822873efc246ba20d', 'uploader': 'afp-news', 'duration': 123, }, }, { 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', 'md5': '4fde7b9e33577bab2f2f8f260e30e979', 'note': 'Embedded ooyala video', 'info_dict': { 'id': '69525809', 'ext': 'mp4', 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', 'uploader': 'newsy-videos', }, 'skip': 'This video has been deleted.', }, ] def _extract_formats(self, source): formats = [] link = source.get('aowPermalink') if link: formats.append({ 'url': link, 'ext': 'mp4', 'format_id': 'aow', }) link = source.get('fullPreviewHashLowPath') if link: formats.append({ 'url': link, 'format_id': 'low', }) link = source.get('fullPreviewHashHighPath') if link: formats.append({ 'url': link, 'format_id': 'high', }) return formats def _extract_video(self, source): return { 'id': source.get('videoId'), 'title': source.get('title'), 'description': source.get('description'), 'thumbnail': source.get('highResImage') or source.get('medResImage'), 'uploader': source.get('username'), 'duration': int_or_none(source.get('length')), 'view_count': int_or_none(source.get('views')), 'age_limit': 18 if source.get('isMature') == 'true' or source.get('isSexy') == 'true' else 0, 'formats': self._extract_formats(source), } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') if video_id.startswith('v'): rsp = self._download_xml( r'http://www.veoh.com/api/findByPermalink?permalink=%s' % video_id, video_id, 'Downloading video XML') stat = rsp.get('stat') if stat == 'ok': return self._extract_video(rsp.find('./videoList/video')) elif stat == 'fail': raise ExtractorError( '%s said: %s' % (self.IE_NAME, rsp.find('./errorList/error').get('errorMessage')), expected=True) webpage = self._download_webpage(url, video_id) age_limit = 0 if 'class="adultwarning-container"' in webpage: self.report_age_confirmation() age_limit = 18 request = sanitized_Request(url) 
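
        # Veoh keeps mature videos behind an interstitial; once the warning
        # container is detected, the page is fetched again (below) with the
        # confirmedAdult cookie set so the real player markup is served.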
request.add_header('Cookie', 'confirmedAdult=true') webpage = self._download_webpage(request, video_id) m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|"|\?)', webpage) if m_youtube is not None: youtube_id = m_youtube.group(1) self.to_screen('%s: detected Youtube video.' % video_id) return self.url_result(youtube_id, 'Youtube') info = json.loads( self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info').replace('\\\'', '\'')) video = self._extract_video(info) video['age_limit'] = age_limit return video youtube-dl/youtube_dl/extractor/moniker.py0000644000000000000000000000761312641030331020051 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import os.path import re from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, remove_start, sanitized_Request, ) class MonikerIE(InfoExtractor): IE_DESC = 'allmyvideos.net and vidspot.net' _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?:(?:2|v)/v-)?(?P[a-zA-Z0-9_-]+)' _TESTS = [{ 'url': 'http://allmyvideos.net/jih3nce3x6wn', 'md5': '710883dee1bfc370ecf9fa6a89307c88', 'info_dict': { 'id': 'jih3nce3x6wn', 'ext': 'mp4', 'title': 'youtube-dl test video', }, }, { 'url': 'http://allmyvideos.net/embed-jih3nce3x6wn', 'md5': '710883dee1bfc370ecf9fa6a89307c88', 'info_dict': { 'id': 'jih3nce3x6wn', 'ext': 'mp4', 'title': 'youtube-dl test video', }, }, { 'url': 'http://vidspot.net/l2ngsmhs8ci5', 'md5': '710883dee1bfc370ecf9fa6a89307c88', 'info_dict': { 'id': 'l2ngsmhs8ci5', 'ext': 'mp4', 'title': 'youtube-dl test video', }, }, { 'url': 'https://www.vidspot.net/l2ngsmhs8ci5', 'only_matching': True, }, { 'url': 'http://vidspot.net/2/v-ywDf99', 'md5': '5f8254ce12df30479428b0152fb8e7ba', 'info_dict': { 'id': 'ywDf99', 'ext': 'mp4', 'title': 'IL FAIT LE MALIN EN PORSHE CAYENNE ( mais pas pour longtemps)', 'description': 'IL FAIT LE MALIN EN PORSHE CAYENNE.', }, }, { 'url': 'http://allmyvideos.net/v/v-HXZm5t', 'only_matching': True, }] def _real_extract(self, url): orig_video_id = self._match_id(url) video_id = remove_start(orig_video_id, 'embed-') url = url.replace(orig_video_id, video_id) assert re.match(self._VALID_URL, url) is not None orig_webpage = self._download_webpage(url, video_id) if '>File Not Found<' in orig_webpage: raise ExtractorError('Video %s does not exist' % video_id, expected=True) error = self._search_regex( r'class="err">([^<]+)<', orig_webpage, 'error', default=None) if error: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error), expected=True) builtin_url = self._search_regex( r']+src=(["\'])(?P.+?/builtin-.+?)\1', orig_webpage, 'builtin URL', default=None, group='url') if builtin_url: req = sanitized_Request(builtin_url) req.add_header('Referer', url) webpage = self._download_webpage(req, video_id, 'Downloading builtin page') title = self._og_search_title(orig_webpage).strip() description = self._og_search_description(orig_webpage).strip() else: fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) data = dict(fields) post = compat_urllib_parse.urlencode(data) headers = { b'Content-Type': b'application/x-www-form-urlencoded', } req = sanitized_Request(url, post, headers) webpage = self._download_webpage( req, video_id, note='Downloading video page ...') title = os.path.splitext(data['fname'])[0] description = None # Could be several links with different quality links = re.findall(r'"file" : "?(.+?)",', webpage) # Assume the links are ordered in quality formats = [{ 'url': l, 
'quality': i, } for i, l in enumerate(links)] self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'formats': formats, } youtube-dl/youtube_dl/extractor/sandia.py0000644000000000000000000000774112641030331017646 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import itertools import json import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, mimetype2ext, sanitized_Request, unified_strdate, ) class SandiaIE(InfoExtractor): IE_DESC = 'Sandia National Laboratories' _VALID_URL = r'https?://digitalops\.sandia\.gov/Mediasite/Play/(?P[0-9a-f]+)' _TEST = { 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', 'md5': '9422edc9b9a60151727e4b6d8bef393d', 'info_dict': { 'id': '24aace4429fc450fb5b38cdbf424a66e1d', 'ext': 'mp4', 'title': 'Xyce Software Training - Section 1', 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', 'upload_date': '20120904', 'duration': 7794, } } def _real_extract(self, url): video_id = self._match_id(url) req = sanitized_Request(url) req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4') webpage = self._download_webpage(req, video_id) js_path = self._search_regex( r'', embed_page, 'embed vars') info = self._parse_json(embed_vars_json, video_id) formats = [] for media in info['media']: if media['mediaPurpose'] == 'play': formats.append({ 'url': media['uri'], 'height': media['height'], 'width:': media['width'], }) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': title, 'formats': formats, 'thumbnail': info.get('thumbUri'), 'description': self._og_search_description(webpage), 'duration': int_or_none(info.get('videoLengthInSeconds')), 'age_limit': parse_age_limit(info.get('audienceRating')), } youtube-dl/youtube_dl/extractor/huffpost.py0000644000000000000000000000505712641030331020243 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( parse_duration, unified_strdate, ) class HuffPostIE(InfoExtractor): IE_DESC = 'Huffington Post' _VALID_URL = r'''(?x) https?://(embed\.)?live\.huffingtonpost\.com/ (?: r/segment/[^/]+/| HPLEmbedPlayer/\?segmentId= ) (?P[0-9a-f]+)''' _TEST = { 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', 'md5': '55f5e8981c1c80a64706a44b74833de8', 'info_dict': { 'id': '52dd3e4b02a7602131000677', 'ext': 'mp4', 'title': 'Legalese It! with @MikeSacksHP', 'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. 
', 'duration': 1549, 'upload_date': '20140124', } } def _real_extract(self, url): video_id = self._match_id(url) api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id data = self._download_json(api_url, video_id)['data'] video_title = data['title'] duration = parse_duration(data.get('running_time')) upload_date = unified_strdate( data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time')) description = data.get('description') thumbnails = [] for url in data['images'].values(): m = re.match('.*-([0-9]+x[0-9]+)\.', url) if not m: continue thumbnails.append({ 'url': url, 'resolution': m.group(1), }) formats = [{ 'format': key, 'format_id': key.replace('/', '.'), 'ext': 'mp4', 'url': url, 'vcodec': 'none' if key.startswith('audio/') else None, } for key, url in data.get('sources', {}).get('live', {}).items()] if not formats and data.get('fivemin_id'): return self.url_result('5min:%s' % data['fivemin_id']) self._sort_formats(formats) return { 'id': video_id, 'title': video_title, 'description': description, 'formats': formats, 'duration': duration, 'upload_date': upload_date, 'thumbnails': thumbnails, } youtube-dl/youtube_dl/extractor/tenplay.py0000644000000000000000000000710512660177411020071 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, float_or_none, ) class TenPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ten(play)?\.com\.au/.+' _TEST = { 'url': 'http://tenplay.com.au/ten-insider/extra/season-2013/tenplay-tv-your-way', 'info_dict': { 'id': '2695695426001', 'ext': 'flv', 'title': 'TENplay: TV your way', 'description': 'Welcome to a new TV experience. Enjoy a taste of the TENplay benefits.', 'timestamp': 1380150606.889, 'upload_date': '20130925', 'uploader': 'TENplay', }, 'params': { 'skip_download': True, # Requires rtmpdump } } _video_fields = [ 'id', 'name', 'shortDescription', 'longDescription', 'creationDate', 'publishedDate', 'lastModifiedDate', 'customFields', 'videoStillURL', 'thumbnailURL', 'referenceId', 'length', 'playsTotal', 'playsTrailingWeek', 'renditions', 'captioning', 'startDate', 'endDate'] def _real_extract(self, url): webpage = self._download_webpage(url, url) video_id = self._html_search_regex( r'videoID: "(\d+?)"', webpage, 'video_id') api_token = self._html_search_regex( r'apiToken: "([a-zA-Z0-9-_\.]+?)"', webpage, 'api_token') title = self._html_search_regex( r'', webpage, 'title') json = self._download_json('https://api.brightcove.com/services/library?command=find_video_by_id&video_id=%s&token=%s&video_fields=%s' % (video_id, api_token, ','.join(self._video_fields)), title) formats = [] for rendition in json['renditions']: url = rendition['remoteUrl'] or rendition['url'] protocol = 'rtmp' if url.startswith('rtmp') else 'http' ext = 'flv' if protocol == 'rtmp' else rendition['videoContainer'].lower() if protocol == 'rtmp': url = url.replace('&mp4:', '') tbr = int_or_none(rendition.get('encodingRate'), 1000) formats.append({ 'format_id': '_'.join( ['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower(), '%sk' % tbr]), 'width': int_or_none(rendition['frameWidth']), 'height': int_or_none(rendition['frameHeight']), 'tbr': tbr, 'filesize': int_or_none(rendition['size']), 'protocol': protocol, 'ext': ext, 'vcodec': rendition['videoCodec'].lower(), 'container': rendition['videoContainer'].lower(), 'url': url, }) self._sort_formats(formats) return { 'id': video_id, 'display_id': json['referenceId'], 
'title': json['name'], 'description': json['shortDescription'] or json['longDescription'], 'formats': formats, 'thumbnails': [{ 'url': json['videoStillURL'] }, { 'url': json['thumbnailURL'] }], 'thumbnail': json['videoStillURL'], 'duration': float_or_none(json.get('length'), 1000), 'timestamp': float_or_none(json.get('creationDate'), 1000), 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay', 'view_count': int_or_none(json.get('playsTotal')), } youtube-dl/youtube_dl/extractor/trutube.py0000644000000000000000000000157412650650456020117 0ustar rootrootfrom __future__ import unicode_literals from .nuevo import NuevoBaseIE class TruTubeIE(NuevoBaseIE): _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P\d+)' _TESTS = [{ 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-', 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1', 'info_dict': { 'id': '14880', 'ext': 'flv', 'title': 'Ramses II - Proven To Be A Red Headed Caucasoid', 'thumbnail': 're:^http:.*\.jpg$', } }, { 'url': 'https://trutube.tv/nuevo/player/embed.php?v=14880', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) return self._extract_nuevo( 'https://trutube.tv/nuevo/player/config.php?v=%s' % video_id, video_id) youtube-dl/youtube_dl/extractor/tagesschau.py0000644000000000000000000001460212641030331020530 0ustar rootroot# -*- coding: utf-8 -*- from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import parse_filesize class TagesschauIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', 'md5': '917a228bc7df7850783bc47979673a09', 'info_dict': { 'id': '102143', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', 'description': 'md5:171feccd9d9b3dd54d05d501568f6359', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', 'md5': '3c54c1f6243d279b706bde660ceec633', 'info_dict': { 'id': '5727', 'ext': 'mp4', 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', 'md5': 'aef45de271c4bf0a5db834aa40bf774c', 'info_dict': { 'id': '18407', 'ext': 'mp3', 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 'only_matching': True, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html', 'only_matching': True, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html', 'only_matching': True, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html', 'only_matching': True, }, { 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html', 'only_matching': True, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html', 'only_matching': True, }, { 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'only_matching': True, }] _FORMATS = { 's': {'width': 256, 'height': 144, 'quality': 1}, 'm': {'width': 512, 'height': 288, 'quality': 2}, 
        'l': {'width': 960, 'height': 544, 'quality': 3},
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        display_id = video_id.lstrip('-')
        webpage = self._download_webpage(url, display_id)

        player_url = self._html_search_meta(
            'twitter:player', webpage, 'player URL', default=None)
        if player_url:
            playerpage = self._download_webpage(
                player_url, display_id, 'Downloading player page')

            formats = []
            for media in re.finditer(
                    r'''(?x)
                        (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
                        ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
                        (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
                    ''', playerpage):
                url = media.group('url')
                type_ = media.group('type')
                ext = media.group('ext')
                res = media.group('quality')
                f = {
                    'format_id': '%s_%s' % (res, ext) if res else ext,
                    'url': url,
                    'ext': ext,
                    'vcodec': 'none' if type_ == 'audio' else None,
                }
                f.update(self._FORMATS.get(res, {}))
                formats.append(f)
            thumbnail = self._og_search_thumbnail(playerpage)
            title = self._og_search_title(webpage).strip()
            description = self._og_search_description(webpage).strip()
        else:
            download_text = self._search_regex(
                r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>',
                webpage, 'download links')
            links = re.finditer(
                r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
                download_text)
            formats = []
            for l in links:
                format_id = self._search_regex(
                    r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID')
                format = {
                    'format_id': format_id,
                    'url': l.group('url'),
                    'format_name': l.group('name'),
                }
                m = re.match(
                    r'''(?x)
                        Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
                        (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
                        (?P<vbr>[0-9]+)kbps&\#10;
                        Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
                        Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
                    l.group('title'))
                if m:
                    format.update({
                        'format_note': m.group('audio_desc'),
                        'vcodec': m.group('vcodec'),
                        'width': int(m.group('width')),
                        'height': int(m.group('height')),
                        'abr': int(m.group('abr')),
                        'vbr': int(m.group('vbr')),
                        'filesize_approx': parse_filesize(m.group('filesize_approx')),
                    })
                formats.append(format)
            thumbnail = self._og_search_thumbnail(webpage)
            description = self._html_search_regex(
                r'(?s)<p class="teasertext">(.*?)</p>',
                webpage, 'description', default=None)
            title = self._html_search_regex(
                r'<span class="headline".*?>(.*?)</span>', webpage, 'title')

        self._sort_formats(formats)

        return {
            'id': display_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
            'description': description,
        }

youtube-dl/youtube_dl/extractor/audiomack.py
# coding: utf-8
from __future__ import unicode_literals

import itertools
import time

from .common import InfoExtractor
from .soundcloud import SoundcloudIE
from ..utils import (
    ExtractorError,
    url_basename,
)


class AudiomackIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)'
    IE_NAME = 'audiomack'
    _TESTS = [
        # hosted on audiomack
        {
            'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary',
            'info_dict': {
                'id': '310086',
                'ext': 'mp3',
                'uploader': 'Roosh Williams',
                'title': 'Extraordinary'
            }
        },
        # audiomack wrapper around soundcloud song
        {
            'add_ie': ['Soundcloud'],
            'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare',
            'info_dict': {
                'id': '172419696',
                'ext': 'mp3',
                'description': 'md5:1fc3272ed7a635cce5be1568c2822997',
                'title': 'Young Thug ft Lil Wayne - Take Kare',
                'uploader': 'Young Thug World',
                'upload_date': '20141016',
            }
        },
    ]

    def _real_extract(self, url):
        # URLs end with [uploader name]/[uploader title]
        # this title is whatever the user types in, and is rarely
        # the proper song title.  Real metadata is in the api response
        album_url_tag = self._match_id(url)

        # Request the extended version of the api for extra fields like artist and title
        api_response = self._download_json(
            'http://www.audiomack.com/api/music/url/song/%s?extended=1&_=%d' % (
                album_url_tag, time.time()),
            album_url_tag)

        # API is inconsistent with errors
        if 'url' not in api_response or not api_response['url'] or 'error' in api_response:
            raise ExtractorError('Invalid url %s' % url)

        # Audiomack wraps a lot of soundcloud tracks in their branded wrapper
        # if so, pass the work off to the soundcloud extractor
        if SoundcloudIE.suitable(api_response['url']):
            return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'}

        return {
            'id': api_response.get('id', album_url_tag),
            'uploader': api_response.get('artist'),
            'title': api_response.get('title'),
            'url': api_response['url'],
        }


class AudiomackAlbumIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)'
    IE_NAME = 'audiomack:album'
    _TESTS = [
        # Standard album playlist
        {
            'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape',
            'playlist_count': 15,
            'info_dict': {
                'id': '812251',
                'title': 'Tha Tour: Part 2 (Official Mixtape)'
            }
        },
        # Album playlist ripped from fakeshoredrive with no metadata
        {
            'url': 'http://www.audiomack.com/album/fakeshoredrive/ppp-pistol-p-project',
            'info_dict': {
                'title': 'PPP (Pistol P Project)',
                'id': '837572',
            },
            'playlist': [{
                'info_dict': {
                    'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)',
                    'id': '837577',
                    'ext': 'mp3',
                    'uploader': 'Lil Herb a.k.a. G Herbo',
                }
            }],
            'params': {
                'playliststart': 9,
                'playlistend': 9,
            }
        }
    ]

    def _real_extract(self, url):
        # URLs end with [uploader name]/[uploader title]
        # this title is whatever the user types in, and is rarely
        # the proper song title.
Real metadata is in the api response album_url_tag = self._match_id(url) result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure for track_no in itertools.count(): # Get song's metadata api_response = self._download_json( 'http://www.audiomack.com/api/music/url/album/%s/%d?extended=1&_=%d' % (album_url_tag, track_no, time.time()), album_url_tag, note='Querying song information (%d)' % (track_no + 1)) # Total failure, only occurs when url is totally wrong # Won't happen in middle of valid playlist (next case) if 'url' not in api_response or 'error' in api_response: raise ExtractorError('Invalid url for track %d of album url %s' % (track_no, url)) # URL is good but song id doesn't exist - usually means end of playlist elif not api_response['url']: break else: # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: result[resultkey] = api_response[apikey] song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': api_response.get('id', song_id), 'uploader': api_response.get('artist'), 'title': api_response.get('title', song_id), 'url': api_response['url'], }) return result youtube-dl/youtube_dl/extractor/cbssports.py0000644000000000000000000000215112641030331020417 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor class CBSSportsIE(InfoExtractor): _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P
<section>[^/]+)/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s',
        'info_dict': {
            'id': '_d5_GbO8p1sT',
            'ext': 'flv',
            'title': 'US Open flashbacks: 1990s',
            'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.',
        },
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        section = mobj.group('section')
        video_id = mobj.group('id')
        all_videos = self._download_json(
            'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section,
            video_id)
        # The json file contains the info of all the videos in the section
        video_info = next(v for v in all_videos if v['pcid'] == video_id)
        return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform')

youtube-dl/youtube_dl/extractor/cinchcast.py
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    unified_strdate,
    xpath_text,
)


class CinchcastIE(InfoExtractor):
    _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
    _TEST = {
        # Actual test is run in generic, look for undergroundwellness
        'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703',
        'only_matching': True,
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        doc = self._download_xml(
            'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id,
            video_id)

        item = doc.find('.//item')
        title = xpath_text(item, './title', fatal=True)
        date_str = xpath_text(
            item, './{http://developer.longtailvideo.com/trac/}date')
        upload_date = unified_strdate(date_str, day_first=False)
        # duration is present but wrong
        formats = [{
            'format_id': 'main',
            'url': item.find('./{http://search.yahoo.com/mrss/}content').attrib['url'],
        }]
        backup_url = xpath_text(
            item, './{http://developer.longtailvideo.com/trac/}backupContent')
        if backup_url:
            formats.append({
                'preference': 2,  # seems to be more reliable
                'format_id': 'backup',
                'url': backup_url,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'upload_date': upload_date,
            'formats': formats,
        }

youtube-dl/youtube_dl/extractor/sexu.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor


class SexuIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?sexu\.com/(?P<id>\d+)'
    _TEST = {
        'url': 'http://sexu.com/961791/',
        'md5': 'ff615aca9691053c94f8f10d96cd7884',
        'info_dict': {
            'id': '961791',
            'ext': 'mp4',
            'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b',
            'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54',
            'categories': list,  # NSFW
            'thumbnail': 're:https?://.*\.jpg$',
            'age_limit': 18,
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        quality_arr = self._search_regex(
            r'sources:\s*\[([^\]]+)\]', webpage, 'format string')
        formats = [{
            'url': fmt[0].replace('\\', ''),
            'format_id': fmt[1],
            'height': int(fmt[1][:3]),
        } for fmt in re.findall(r'"file":"([^"]+)","label":"([^"]+)"', quality_arr)]
        self._sort_formats(formats)

        title = self._html_search_regex(
            r'<title>([^<]+)\s*-\s*Sexu\.Com</title>', webpage, 'title')

        description = self._html_search_meta(
            'description', webpage, 'description')

        thumbnail = self._html_search_regex(
            r'image:\s*"([^"]+)"',
            webpage, 'thumbnail', fatal=False)

        categories_str = self._html_search_meta(
            'keywords', webpage, 'categories')
        categories = (
            None if categories_str is None
            else
categories_str.split(',')) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'categories': categories, 'formats': formats, 'age_limit': 18, } youtube-dl/youtube_dl/extractor/rts.py0000644000000000000000000002207112641247326017226 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .srgssr import SRGSSRIE from ..compat import ( compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( int_or_none, parse_duration, parse_iso8601, unescapeHTML, xpath_text, ) class RTSIE(SRGSSRIE): IE_DESC = 'RTS.ch' _VALID_URL = r'rts:(?P\d+)|https?://(?:www\.)?rts\.ch/(?:[^/]+/){2,}(?P[0-9]+)-(?P.+?)\.html' _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', 'md5': 'f254c4b26fb1d3c183793d52bc40d3e7', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', 'ext': 'mp4', 'duration': 1488, 'title': 'Les Enfants Terribles', 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', 'uploader': 'Divers', 'upload_date': '19680921', 'timestamp': -40280400, 'thumbnail': 're:^https?://.*\.image', 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', 'md5': 'f1077ac5af686c76528dc8d7c5df29ba', 'info_dict': { 'id': '5742494', 'display_id': '5742494', 'ext': 'mp4', 'duration': 3720, 'title': 'Les yeux dans les cieux - Mon homard au Canada', 'description': 'md5:d22ee46f5cc5bac0912e5a0c6d44a9f7', 'uploader': 'Passe-moi les jumelles', 'upload_date': '20140404', 'timestamp': 1396635300, 'thumbnail': 're:^https?://.*\.image', 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } }, { 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', 'md5': 'b4326fecd3eb64a458ba73c73e91299d', 'info_dict': { 'id': '5745975', 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', 'ext': 'mp4', 'duration': 48, 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski', 'description': 'Hockey - Playoff', 'uploader': 'Hockey', 'upload_date': '20140403', 'timestamp': 1396556882, 'thumbnail': 're:^https?://.*\.image', 'view_count': int, }, 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', 'md5': '9f713382f15322181bb366cc8c3a4ff0', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', 'ext': 'mp4', 'duration': 33, 'title': 'Londres cachée par un épais smog', 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', 'uploader': 'Le Journal en continu', 'upload_date': '20140403', 'timestamp': 1396537322, 'thumbnail': 're:^https?://.*\.image', 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } }, { 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', 'md5': 'dd8ef6a22dff163d063e2a52bc8adcae', 'info_dict': { 'id': '5706148', 'display_id': 'urban-hippie-de-damien-krisl-03-04-2014', 'ext': 'mp3', 'duration': 123, 'title': '"Urban Hippie", de Damien Krisl', 'description': 'Des Hippies super glam.', 'upload_date': '20140403', 'timestamp': 1396551600, }, }, { # article with videos on rhs 'url': 
'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', 'info_dict': { 'id': '6693917', 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', }, 'playlist_mincount': 5, } ] def _real_extract(self, url): m = re.match(self._VALID_URL, url) media_id = m.group('rts_id') or m.group('id') display_id = m.group('display_id') or media_id def download_json(internal_id): return self._download_json( 'http://www.rts.ch/a/%s.html?f=json/article' % internal_id, display_id) all_info = download_json(media_id) # media_id extracted out of URL is not always a real id if 'video' not in all_info and 'audio' not in all_info: page = self._download_webpage(url, display_id) # article with videos on rhs videos = re.findall( r']+class="content-item"[^>]*>\s*]+data-video-urn="urn:([^"]+)"', page) if not videos: videos = re.findall( r'(?s)]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', page) if videos: entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] return self.playlist_result(entries, media_id, self._og_search_title(page)) internal_id = self._html_search_regex( r'<(?:video|audio) data-id="([0-9]+)"', page, 'internal video id') all_info = download_json(internal_id) media_type = 'video' if 'video' in all_info else 'audio' # check for errors self.get_media_data('rts', media_type, media_id) info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] upload_timestamp = parse_iso8601(info.get('broadcast_date')) duration = info.get('duration') or info.get('cutout') or info.get('cutduration') if isinstance(duration, compat_str): duration = parse_duration(duration) view_count = info.get('plays') thumbnail = unescapeHTML(info.get('preview_image_url')) def extract_bitrate(url): return int_or_none(self._search_regex( r'-([0-9]+)k\.', url, 'bitrate', default=None)) formats = [] for format_id, format_url in info['streams'].items(): if format_id == 'hds_sd' and 'hds' in info['streams']: continue if format_id == 'hls_sd' and 'hls' in info['streams']: continue if format_url.endswith('.f4m'): token = self._download_xml( 'http://tp.srgssr.ch/token/akahd.xml?stream=%s/*' % compat_urllib_parse_urlparse(format_url).path, media_id, 'Downloading %s token' % format_id) auth_params = xpath_text(token, './/authparams', 'auth params') if not auth_params: continue formats.extend(self._extract_f4m_formats( '%s?%s&hdcore=3.4.0&plugin=aasp-3.4.0.132.66' % (format_url, auth_params), media_id, f4m_id=format_id, fatal=False)) elif format_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats( format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, 'url': format_url, 'tbr': extract_bitrate(format_url), }) if 'media' in info: formats.extend([{ 'format_id': '%s-%sk' % (media['ext'], media['rate']), 'url': 'http://download-video.rts.ch/%s' % media['url'], 'tbr': media['rate'] or extract_bitrate(media['url']), } for media in info['media'] if media.get('rate')]) self._check_formats(formats, media_id) self._sort_formats(formats) return { 'id': media_id, 'display_id': display_id, 'formats': formats, 'title': info['title'], 'description': info.get('intro'), 'duration': duration, 'view_count': view_count, 'uploader': info.get('programName'), 'timestamp': upload_timestamp, 'thumbnail': thumbnail, } youtube-dl/youtube_dl/extractor/planetaplay.py0000644000000000000000000000360112641030331020710 0ustar rootroot# coding: utf-8 from __future__ import 
unicode_literals import re from .common import InfoExtractor from ..utils import ExtractorError class PlanetaPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?planetaplay\.com/\?sng=(?P[0-9]+)' _API_URL = 'http://planetaplay.com/action/playlist/?sng={0:}' _THUMBNAIL_URL = 'http://planetaplay.com/img/thumb/{thumb:}' _TEST = { 'url': 'http://planetaplay.com/?sng=3586', 'md5': '9d569dceb7251a4e01355d5aea60f9db', 'info_dict': { 'id': '3586', 'ext': 'flv', 'title': 'md5:e829428ee28b1deed00de90de49d1da1', }, 'skip': 'Not accessible from Travis CI server', } _SONG_FORMATS = { 'lq': (0, 'http://www.planetaplay.com/videoplayback/{med_hash:}'), 'hq': (1, 'http://www.planetaplay.com/videoplayback/hi/{med_hash:}'), } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') response = self._download_json( self._API_URL.format(video_id), video_id)['response'] try: data = response.get('data')[0] except IndexError: raise ExtractorError( '%s: failed to get the playlist' % self.IE_NAME, expected=True) title = '{song_artists:} - {sng_name:}'.format(**data) thumbnail = self._THUMBNAIL_URL.format(**data) formats = [] for format_id, (quality, url_template) in self._SONG_FORMATS.items(): formats.append({ 'format_id': format_id, 'url': url_template.format(**data), 'quality': quality, 'ext': 'flv', }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, } youtube-dl/youtube_dl/extractor/aftonbladet.py0000644000000000000000000000520112641030331020657 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import int_or_none class AftonbladetIE(InfoExtractor): _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' _TEST = { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { 'id': '36015', 'ext': 'mp4', 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', 'description': 'Jupiters måne mest aktiv av alla himlakroppar', 'timestamp': 1394142732, 'upload_date': '20140306', }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) # find internal video meta data meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' player_config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) internal_meta_id = player_config['videoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') # find internal video formats format_url = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s' internal_video_id = internal_meta_json['videoId'] internal_formats_url = format_url % internal_video_id internal_formats_json = self._download_json( internal_formats_url, video_id, 'Downloading video formats') formats = [] for fmt in internal_formats_json['formats']['http']['pseudostreaming']['mp4']: p = fmt['paths'][0] formats.append({ 'url': 'http://%s:%d/%s/%s' % (p['address'], p['port'], p['path'], p['filename']), 'ext': 'mp4', 'width': int_or_none(fmt.get('width')), 'height': int_or_none(fmt.get('height')), 'tbr': int_or_none(fmt.get('bitrate')), 'protocol': 'http', }) self._sort_formats(formats) return { 'id': video_id, 'title': internal_meta_json['title'], 'formats': formats, 'thumbnail': internal_meta_json.get('imageUrl'), 'description': internal_meta_json.get('shortPreamble'), 'timestamp': 
int_or_none(internal_meta_json.get('timePublished')), 'duration': int_or_none(internal_meta_json.get('duration')), 'view_count': int_or_none(internal_meta_json.get('views')), } youtube-dl/youtube_dl/extractor/viewster.py0000644000000000000000000001524312650650456020273 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_urllib_parse, compat_urllib_parse_unquote, ) from ..utils import ( determine_ext, ExtractorError, int_or_none, parse_iso8601, sanitized_Request, HEADRequest, ) class ViewsterIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?viewster\.com/(?:serie|movie)/(?P\d+-\d+-\d+)' _TESTS = [{ # movie, Type=Movie 'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/', 'md5': 'e642d1b27fcf3a4ffa79f194f5adde36', 'info_dict': { 'id': '1140-11855-000', 'ext': 'mp4', 'title': 'The listening Project', 'description': 'md5:bac720244afd1a8ea279864e67baa071', 'timestamp': 1214870400, 'upload_date': '20080701', 'duration': 4680, }, }, { # series episode, Type=Episode 'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/', 'md5': '9243079a8531809efe1b089db102c069', 'info_dict': { 'id': '1284-19427-001', 'ext': 'mp4', 'title': 'The World and a Wall', 'description': 'md5:24814cf74d3453fdf5bfef9716d073e3', 'timestamp': 1428192000, 'upload_date': '20150405', 'duration': 1500, }, }, { # serie, Type=Serie 'url': 'http://www.viewster.com/serie/1303-19426-000/', 'info_dict': { 'id': '1303-19426-000', 'title': 'Is It Wrong to Try to Pick up Girls in a Dungeon?', 'description': 'md5:eeda9bef25b0d524b3a29a97804c2f11', }, 'playlist_count': 13, }, { # unfinished serie, no Type 'url': 'http://www.viewster.com/serie/1284-19427-000/baby-steps-season-2/', 'info_dict': { 'id': '1284-19427-000', 'title': 'Baby Steps—Season 2', 'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1', }, 'playlist_mincount': 16, }, { # geo restricted series 'url': 'https://www.viewster.com/serie/1280-18794-002/', 'only_matching': True, }, { # geo restricted video 'url': 'https://www.viewster.com/serie/1280-18794-002/what-is-extraterritoriality-lawo/', 'only_matching': True, }] _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True): request = sanitized_Request(url) request.add_header('Accept', self._ACCEPT_HEADER) request.add_header('Auth-token', self._AUTH_TOKEN) return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal) def _real_extract(self, url): video_id = self._match_id(url) # Get 'api_token' cookie self._request_webpage(HEADRequest('http://www.viewster.com/'), video_id) cookies = self._get_cookies('http://www.viewster.com/') self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value) info = self._download_json( 'https://public-api.viewster.com/search/%s' % video_id, video_id, 'Downloading entry JSON') entry_id = info.get('Id') or info['id'] # unfinished serie has no Type if info.get('Type') in ('Serie', None): try: episodes = self._download_json( 'https://public-api.viewster.com/series/%s/episodes' % entry_id, video_id, 'Downloading series JSON') except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: self.raise_geo_restricted() else: raise entries = [ self.url_result( 'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster') for episode in episodes] title = (info.get('Title') or 
info['Synopsis']['Title']).strip() description = info.get('Synopsis', {}).get('Detailed') return self.playlist_result(entries, video_id, title, description) formats = [] for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): media = self._download_json( 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' % (entry_id, compat_urllib_parse.quote(media_type)), video_id, 'Downloading %s JSON' % media_type, fatal=False) if not media: continue video_url = media.get('Uri') if not video_url: continue ext = determine_ext(video_url) if ext == 'f4m': video_url += '&' if '?' in video_url else '?' video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds')) elif ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) # m3u8 sometimes fail if m3u8_formats: formats.extend(m3u8_formats) else: format_id = media.get('Bitrate') f = { 'url': video_url, 'format_id': 'mp4-%s' % format_id, 'height': int_or_none(media.get('Height')), 'width': int_or_none(media.get('Width')), 'preference': 1, } if format_id and not f['height']: f['height'] = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) formats.append(f) if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): self.raise_geo_restricted() self._sort_formats(formats) synopsis = info.get('Synopsis') or {} # Prefer title outside synopsis since it's less messy title = (info.get('Title') or synopsis['Title']).strip() description = synopsis.get('Detailed') or (info.get('Synopsis') or {}).get('Short') duration = int_or_none(info.get('Duration')) timestamp = parse_iso8601(info.get('ReleaseDate')) return { 'id': video_id, 'title': title, 'description': description, 'timestamp': timestamp, 'duration': duration, 'formats': formats, } youtube-dl/youtube_dl/extractor/mlb.py0000644000000000000000000001571412641030331017160 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, ) class MLBIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?:[\da-z_-]+\.)*mlb\.com/ (?: (?: (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| )\?.*?\bcontent_id= ) (?Pn?\d+)| (?:[^/]+/)*(?P[^/]+) ) ''' _TESTS = [ { 'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', 'md5': 'ff56a598c2cf411a9a38a69709e97079', 'info_dict': { 'id': '34698933', 'ext': 'mp4', 'title': "Ackley's spectacular catch", 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', 'duration': 66, 'timestamp': 1405980600, 'upload_date': '20140721', 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', 'md5': 'd9c022c10d21f849f49c05ae12a8a7e9', 'info_dict': { 'id': '34496663', 'ext': 'mp4', 'title': 'Stanton prepares for Derby', 'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57', 'duration': 46, 'timestamp': 1405105800, 'upload_date': '20140711', 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby', 'md5': '0e6e73d509321e142409b695eadd541f', 'info_dict': { 'id': '34578115', 'ext': 'mp4', 'title': 'Cespedes repeats as Derby champ', 'description': 
'md5:08df253ce265d4cf6fb09f581fafad07', 'duration': 488, 'timestamp': 1405399936, 'upload_date': '20140715', 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance', 'md5': 'b8fd237347b844365d74ea61d4245967', 'info_dict': { 'id': '34577915', 'ext': 'mp4', 'title': 'Bautista on Home Run Derby', 'description': 'md5:b80b34031143d0986dddc64a8839f0fb', 'duration': 52, 'timestamp': 1405390722, 'upload_date': '20140715', 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', 'md5': 'b190e70141fb9a1552a85426b4da1b5d', 'info_dict': { 'id': '75609783', 'ext': 'mp4', 'title': 'Must C: Pillar climbs for catch', 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', 'timestamp': 1429124820, 'upload_date': '20150415', } }, { 'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb', 'only_matching': True, }, { 'url': 'http://mlb.mlb.com/shared/video/embed/embed.html?content_id=36599553', 'only_matching': True, }, { 'url': 'http://mlb.mlb.com/es/video/play.jsp?content_id=36599553', 'only_matching': True, }, { 'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', 'only_matching': True, }, { # From http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer 'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb', 'only_matching': True, }, { 'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#', 'only_matching': True, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') if not video_id: video_path = mobj.group('path') webpage = self._download_webpage(url, video_path) video_id = self._search_regex( [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id') detail = self._download_xml( 'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id) title = detail.find('./headline').text description = detail.find('./big-blurb').text duration = parse_duration(detail.find('./duration').text) timestamp = parse_iso8601(detail.attrib['date'][:-5]) thumbnails = [{ 'url': thumbnail.text, } for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')] formats = [] for media_url in detail.findall('./url'): playback_scenario = media_url.attrib['playback_scenario'] fmt = { 'url': media_url.text, 'format_id': playback_scenario, } m = re.search(r'(?P\d+)K_(?P\d+)X(?P\d+)', playback_scenario) if m: fmt.update({ 'vbr': int(m.group('vbr')) * 1000, 'width': int(m.group('width')), 'height': int(m.group('height')), }) formats.append(fmt) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'duration': duration, 'timestamp': timestamp, 'formats': formats, 'thumbnails': thumbnails, } 
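# A minimal, self-contained sketch of the format parsing step used by MLBIE
# above. The sample value 'FLASH_1200K_640X360' is an assumed example of the
# playback_scenario naming scheme, not a string taken from the repository.
import re


def parse_playback_scenario(playback_scenario):
    """Return (vbr, width, height) parsed from an id like 'FLASH_1200K_640X360'."""
    m = re.search(
        r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario)
    if not m:
        # audio-only or otherwise unrecognized scenarios carry no video metadata
        return None, None, None
    return (int(m.group('vbr')) * 1000,
            int(m.group('width')), int(m.group('height')))


# parse_playback_scenario('FLASH_1200K_640X360') -> (1200000, 640, 360)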
youtube-dl/youtube_dl/extractor/spiegel.py0000644000000000000000000001305112641030331020026 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urlparse from .spiegeltv import SpiegeltvIE class SpiegelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', 'md5': '2c2754212136f35fb4b19767d242f66e', 'info_dict': { 'id': '1259285', 'ext': 'mp4', 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', 'md5': 'f2cdf638d7aa47654e251e1aee360af1', 'info_dict': { 'id': '1309159', 'ext': 'mp4', 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, }, }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', 'md5': 'd8eeca6bfc8f1cd6f490eb1f44695d51', 'info_dict': { 'id': '1519126', 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage, handle = self._download_webpage_handle(url, video_id) # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') title = re.sub(r'\s+', ' ', self._html_search_regex( r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)', webpage, 'title')) description = self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( [r'server\s*:\s*(["\'])(?P.+?)\1', r'var\s+server\s*=\s*"(?P[^"]+)\"'], webpage, 'server URL', group='url') xml_url = base_url + video_id + '.xml' idoc = self._download_xml(xml_url, video_id) formats = [] for n in list(idoc): if n.tag.startswith('type') and n.tag != 'type6': format_id = n.tag.rpartition('type')[2] video_url = base_url + n.find('./filename').text formats.append({ 'format_id': format_id, 'url': video_url, 'width': int(n.find('./width').text), 'height': int(n.find('./height').text), 'abr': int(n.find('./audiobitrate').text), 'vbr': int(n.find('./videobitrate').text), 'vcodec': n.find('./codec').text, 'acodec': 'MP4A', }) duration = float(idoc[0].findall('./duration')[0].text) self._check_formats(formats, video_id) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'duration': duration, 'formats': formats, } class SpiegelArticleIE(InfoExtractor): _VALID_URL = 'https?://www\.spiegel\.de/(?!video/)[^?#]*?-(?P[0-9]+)\.html' IE_NAME = 'Spiegel:Article' IE_DESC = 'Articles on spiegel.de' _TESTS = [{ 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', 
'info_dict': { 'id': '1516455', 'ext': 'mp4', 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', 'description': 're:^Patrick Kämnitz gehört.{100,}', }, }, { 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', 'info_dict': { }, 'playlist_count': 6, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) # Single video on top of the page video_link = self._search_regex( r'\s*.*?url\s*=\s*"([^"]+)"', webpage) entries = [ self.url_result(compat_urlparse.urljoin( self.http_scheme() + '//spiegel.de/', embed_path)) for embed_path in embeds ] return self.playlist_result(entries) youtube-dl/youtube_dl/extractor/cliphunter.py0000644000000000000000000000445012641030331020556 0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor from ..utils import int_or_none _translation_table = { 'a': 'h', 'd': 'e', 'e': 'v', 'f': 'o', 'g': 'f', 'i': 'd', 'l': 'n', 'm': 'a', 'n': 'm', 'p': 'u', 'q': 't', 'r': 's', 'v': 'p', 'x': 'r', 'y': 'l', 'z': 'i', '$': ':', '&': '.', '(': '=', '^': '&', '=': '/', } def _decode(s): return ''.join(_translation_table.get(c, c) for c in s) class CliphunterIE(InfoExtractor): IE_NAME = 'cliphunter' _VALID_URL = r'''(?x)http://(?:www\.)?cliphunter\.com/w/ (?P[0-9]+)/ (?P.+?)(?:$|[#\?]) ''' _TEST = { 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', 'md5': 'b7c9bbd4eb3a226ab91093714dcaa480', 'info_dict': { 'id': '1012420', 'ext': 'flv', 'title': 'Fun Jynx Maze solo', 'thumbnail': 're:^https?://.*\.jpg$', 'age_limit': 18, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_title = self._search_regex( r'mediaTitle = "([^"]+)"', webpage, 'title') gexo_files = self._parse_json( self._search_regex( r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'), video_id) formats = [] for format_id, f in gexo_files.items(): video_url = f.get('url') if not video_url: continue fmt = f.get('fmt') height = f.get('h') format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id formats.append({ 'url': _decode(video_url), 'format_id': format_id, 'width': int_or_none(f.get('w')), 'height': int_or_none(height), 'tbr': int_or_none(f.get('br')), }) self._sort_formats(formats) thumbnail = self._search_regex( r"var\s+mov_thumb\s*=\s*'([^']+)';", webpage, 'thumbnail', fatal=False) return { 'id': video_id, 'title': video_title, 'formats': formats, 'age_limit': self._rta_search(webpage), 'thumbnail': thumbnail, } youtube-dl/youtube_dl/extractor/ustream.py0000644000000000000000000001214012650650456020074 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_urlparse, ) from ..utils import ( ExtractorError, int_or_none, float_or_none, ) class UstreamIE(InfoExtractor): _VALID_URL = r'https?://www\.ustream\.tv/(?Precorded|embed|embed/recorded)/(?P\d+)' IE_NAME = 'ustream' _TESTS = [{ 'url': 'http://www.ustream.tv/recorded/20274954', 'md5': '088f151799e8f572f84eb62f17d73e5c', 'info_dict': { 'id': '20274954', 'ext': 'flv', 'title': 'Young Americans for Liberty February 7, 2012 2:28 AM', 'description': 'Young Americans for Liberty February 7, 2012 2:28 AM', 'timestamp': 1328577035, 'upload_date': '20120207', 'uploader': 'yaliberty', 'uploader_id': '6780869', }, }, { # From http://sportscanada.tv/canadagames/index.php/week2/figure-skating/444 # Title and uploader available only 
from params JSON 'url': 'http://www.ustream.tv/embed/recorded/59307601?ub=ff0000&lc=ff0000&oc=ffffff&uc=ffffff&v=3&wmode=direct', 'md5': '5a2abf40babeac9812ed20ae12d34e10', 'info_dict': { 'id': '59307601', 'ext': 'flv', 'title': '-CG11- Canada Games Figure Skating', 'uploader': 'sportscanadatv', }, 'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.', }] def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('id') # some sites use this embed format (see: https://github.com/rg3/youtube-dl/issues/2990) if m.group('type') == 'embed/recorded': video_id = m.group('id') desktop_url = 'http://www.ustream.tv/recorded/' + video_id return self.url_result(desktop_url, 'Ustream') if m.group('type') == 'embed': video_id = m.group('id') webpage = self._download_webpage(url, video_id) desktop_video_id = self._html_search_regex( r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id') desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id return self.url_result(desktop_url, 'Ustream') params = self._download_json( 'https://api.ustream.tv/videos/%s.json' % video_id, video_id) error = params.get('error') if error: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error), expected=True) video = params['video'] title = video['title'] filesize = float_or_none(video.get('file_size')) formats = [{ 'id': video_id, 'url': video_url, 'ext': format_id, 'filesize': filesize, } for format_id, video_url in video['media_urls'].items()] self._sort_formats(formats) description = video.get('description') timestamp = int_or_none(video.get('created_at')) duration = float_or_none(video.get('length')) view_count = int_or_none(video.get('views')) uploader = video.get('owner', {}).get('username') uploader_id = video.get('owner', {}).get('id') thumbnails = [{ 'id': thumbnail_id, 'url': thumbnail_url, } for thumbnail_id, thumbnail_url in video.get('thumbnail', {}).items()] return { 'id': video_id, 'title': title, 'description': description, 'thumbnails': thumbnails, 'timestamp': timestamp, 'duration': duration, 'view_count': view_count, 'uploader': uploader, 'uploader_id': uploader_id, 'formats': formats, } class UstreamChannelIE(InfoExtractor): _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P.+)' IE_NAME = 'ustream:channel' _TEST = { 'url': 'http://www.ustream.tv/channel/channeljapan', 'info_dict': { 'id': '10874166', }, 'playlist_mincount': 17, } def _real_extract(self, url): m = re.match(self._VALID_URL, url) display_id = m.group('slug') webpage = self._download_webpage(url, display_id) channel_id = self._html_search_meta('ustream:channel_id', webpage) BASE = 'http://www.ustream.tv' next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id video_ids = [] while next_url: reply = self._download_json( compat_urlparse.urljoin(BASE, next_url), display_id, note='Downloading video information (next: %d)' % (len(video_ids) + 1)) video_ids.extend(re.findall(r'data-content-id="(\d.*)"', reply['data'])) next_url = reply['nextUrl'] entries = [ self.url_result('http://www.ustream.tv/recorded/' + vid, 'Ustream') for vid in video_ids] return { '_type': 'playlist', 'id': channel_id, 'display_id': display_id, 'entries': entries, } youtube-dl/youtube_dl/extractor/swrmediathek.py0000644000000000000000000000711412641030331021070 0ustar rootroot# -*- coding: utf-8 -*- from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import parse_duration class SWRMediathekIE(InfoExtractor): _VALID_URL = 
r'https?://(?:www\.)?swrmediathek\.de/(?:content/)?player\.htm\?show=(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ 'url': 'http://swrmediathek.de/player.htm?show=849790d0-dab8-11e3-a953-0026b975f2e6', 'md5': '8c5f6f0172753368547ca8413a7768ac', 'info_dict': { 'id': '849790d0-dab8-11e3-a953-0026b975f2e6', 'ext': 'mp4', 'title': 'SWR odysso', 'description': 'md5:2012e31baad36162e97ce9eb3f157b8a', 'thumbnail': 're:^http:.*\.jpg$', 'duration': 2602, 'upload_date': '20140515', 'uploader': 'SWR Fernsehen', 'uploader_id': '990030', }, }, { 'url': 'http://swrmediathek.de/player.htm?show=0e1a8510-ddf2-11e3-9be3-0026b975f2e6', 'md5': 'b10ab854f912eecc5a6b55cd6fc1f545', 'info_dict': { 'id': '0e1a8510-ddf2-11e3-9be3-0026b975f2e6', 'ext': 'mp4', 'title': 'Nachtcafé - Alltagsdroge Alkohol - zwischen Sektempfang und Komasaufen', 'description': 'md5:e0a3adc17e47db2c23aab9ebc36dbee2', 'thumbnail': 're:http://.*\.jpg', 'duration': 5305, 'upload_date': '20140516', 'uploader': 'SWR Fernsehen', 'uploader_id': '990030', }, }, { 'url': 'http://swrmediathek.de/player.htm?show=bba23e10-cb93-11e3-bf7f-0026b975f2e6', 'md5': '4382e4ef2c9d7ce6852535fa867a0dd3', 'info_dict': { 'id': 'bba23e10-cb93-11e3-bf7f-0026b975f2e6', 'ext': 'mp3', 'title': 'Saša Stanišic: Vor dem Fest', 'description': 'md5:5b792387dc3fbb171eb709060654e8c9', 'thumbnail': 're:http://.*\.jpg', 'duration': 3366, 'upload_date': '20140520', 'uploader': 'SWR 2', 'uploader_id': '284670', } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') video = self._download_json( 'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id, video_id, 'Downloading video JSON') attr = video['attr'] media_type = attr['entry_etype'] formats = [] for entry in video['sub']: if entry['name'] != 'entry_media': continue entry_attr = entry['attr'] codec = entry_attr['val0'] quality = int(entry_attr['val1']) fmt = { 'url': entry_attr['val2'], 'quality': quality, } if media_type == 'Video': fmt.update({ 'format_note': ['144p', '288p', '544p', '720p'][quality - 1], 'vcodec': codec, }) elif media_type == 'Audio': fmt.update({ 'acodec': codec, }) formats.append(fmt) self._sort_formats(formats) return { 'id': video_id, 'title': attr['entry_title'], 'description': attr['entry_descl'], 'thumbnail': attr['entry_image_16_9'], 'duration': parse_duration(attr['entry_durat']), 'upload_date': attr['entry_pdatet'][:-4], 'uploader': attr['channel_title'], 'uploader_id': attr['channel_idkey'], 'formats': formats, } youtube-dl/youtube_dl/extractor/iqiyi.py0000644000000000000000000005321112662564617017552 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import hashlib import itertools import math import os import random import re import time import uuid from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_str, compat_urllib_parse, compat_urllib_parse_urlparse, ) from ..utils import ( ExtractorError, ohdave_rsa_encrypt, remove_start, sanitized_Request, urlencode_postdata, url_basename, ) def md5_text(text): return hashlib.md5(text.encode('utf-8')).hexdigest() class IqiyiSDK(object): def __init__(self, target, ip, timestamp): self.target = target self.ip = ip self.timestamp = timestamp @staticmethod def split_sum(data): return compat_str(sum(map(lambda p: int(p, 16), list(data)))) @staticmethod def digit_sum(num): if isinstance(num, int): num = compat_str(num) return compat_str(sum(map(int, num))) def even_odd(self): even = self.digit_sum(compat_str(self.timestamp)[::2]) odd = 
self.digit_sum(compat_str(self.timestamp)[1::2]) return even, odd def preprocess(self, chunksize): self.target = md5_text(self.target) chunks = [] for i in range(32 // chunksize): chunks.append(self.target[chunksize * i:chunksize * (i + 1)]) if 32 % chunksize: chunks.append(self.target[32 - 32 % chunksize:]) return chunks, list(map(int, self.ip.split('.'))) def mod(self, modulus): chunks, ip = self.preprocess(32) self.target = chunks[0] + ''.join(map(lambda p: compat_str(p % modulus), ip)) def split(self, chunksize): modulus_map = { 4: 256, 5: 10, 8: 100, } chunks, ip = self.preprocess(chunksize) ret = '' for i in range(len(chunks)): ip_part = compat_str(ip[i] % modulus_map[chunksize]) if i < 4 else '' if chunksize == 8: ret += ip_part + chunks[i] else: ret += chunks[i] + ip_part self.target = ret def handle_input16(self): self.target = md5_text(self.target) self.target = self.split_sum(self.target[:16]) + self.target + self.split_sum(self.target[16:]) def handle_input8(self): self.target = md5_text(self.target) ret = '' for i in range(4): part = self.target[8 * i:8 * (i + 1)] ret += self.split_sum(part) + part self.target = ret def handleSum(self): self.target = md5_text(self.target) self.target = self.split_sum(self.target) + self.target def date(self, scheme): self.target = md5_text(self.target) d = time.localtime(self.timestamp) strings = { 'y': compat_str(d.tm_year), 'm': '%02d' % d.tm_mon, 'd': '%02d' % d.tm_mday, } self.target += ''.join(map(lambda c: strings[c], list(scheme))) def split_time_even_odd(self): even, odd = self.even_odd() self.target = odd + md5_text(self.target) + even def split_time_odd_even(self): even, odd = self.even_odd() self.target = even + md5_text(self.target) + odd def split_ip_time_sum(self): chunks, ip = self.preprocess(32) self.target = compat_str(sum(ip)) + chunks[0] + self.digit_sum(self.timestamp) def split_time_ip_sum(self): chunks, ip = self.preprocess(32) self.target = self.digit_sum(self.timestamp) + chunks[0] + compat_str(sum(ip)) class IqiyiSDKInterpreter(object): BASE62_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' def __init__(self, sdk_code): self.sdk_code = sdk_code @classmethod def base62(cls, num): if num == 0: return '0' ret = '' while num: ret = cls.BASE62_TABLE[num % 62] + ret num = num // 62 return ret def decode_eval_codes(self): self.sdk_code = self.sdk_code[5:-3] mobj = re.search( r"'([^']+)',62,(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}", self.sdk_code) obfucasted_code, count, symbols = mobj.groups() count = int(count) symbols = symbols.split('|') symbol_table = {} while count: count -= 1 b62count = self.base62(count) symbol_table[b62count] = symbols[count] or b62count self.sdk_code = re.sub( r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], obfucasted_code) def run(self, target, ip, timestamp): self.decode_eval_codes() functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code) sdk = IqiyiSDK(target, ip, timestamp) other_functions = { 'handleSum': sdk.handleSum, 'handleInput8': sdk.handle_input8, 'handleInput16': sdk.handle_input16, 'splitTimeEvenOdd': sdk.split_time_even_odd, 'splitTimeOddEven': sdk.split_time_odd_even, 'splitIpTimeSum': sdk.split_ip_time_sum, 'splitTimeIpSum': sdk.split_time_ip_sum, } for function in functions: if re.match(r'mod\d+', function): sdk.mod(int(function[3:])) elif re.match(r'date[ymd]{3}', function): sdk.date(function[4:]) elif re.match(r'split\d+', function): sdk.split(int(function[5:])) elif function in other_functions: other_functions[function]() else: raise 
ExtractorError('Unknown funcion %s' % function) return sdk.target class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html' _NETRC_MACHINE = 'iqiyi' _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', 'md5': '2cb594dc2781e6c941a110d8f358118b', 'info_dict': { 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', 'title': '美国德州空中惊现奇异云团 酷似UFO', 'ext': 'f4v', } }, { 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb', 'title': '名侦探柯南第752集', }, 'playlist': [{ 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }], 'params': { 'skip_download': True, }, }, { 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', 'only_matching': True, }, { 'url': 'http://www.iqiyi.com/a_19rrhbc6kt.html', 'only_matching': True, }, { 'url': 'http://yule.iqiyi.com/pcb.html', 'only_matching': True, }, { # VIP-only video. The first 2 parts (6 minutes) are available without login # MD5 sums omitted as values are different on Travis CI and my machine 'url': 'http://www.iqiyi.com/v_19rrny4w8w.html', 'info_dict': { 'id': 'f3cf468b39dddb30d676f89a91200dc1', 'title': '泰坦尼克号', }, 'playlist': [{ 'info_dict': { 'id': 'f3cf468b39dddb30d676f89a91200dc1_part1', 'ext': 'f4v', 'title': '泰坦尼克号', }, }, { 'info_dict': { 'id': 'f3cf468b39dddb30d676f89a91200dc1_part2', 'ext': 'f4v', 'title': '泰坦尼克号', }, }], 'expected_warnings': ['Needs a VIP account for full video'], }, { 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html', 'info_dict': { 'id': '202918101', 'title': '灌篮高手 国语版', }, 'playlist_count': 101, }] _FORMATS_MAP = [ ('1', 'h6'), ('2', 'h5'), ('3', 'h4'), ('4', 'h3'), ('5', 'h2'), ('10', 'h1'), ] def _real_initialize(self): self._login() @staticmethod def _rsa_fun(data): # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd e = 65537 return ohdave_rsa_encrypt(data, e, N) def _login(self): (username, password) = self._get_login_info() # No authentication to be performed if not username: return True data = self._download_json( 'http://kylin.iqiyi.com/get_token', None, note='Get token for logging', errnote='Unable to get token for logging') sdk = data['sdk'] timestamp = int(time.time()) target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % ( username, self._rsa_fun(password.encode('utf-8'))) interp = IqiyiSDKInterpreter(sdk) sign = interp.run(target, data['ip'], timestamp) validation_params = { 'target': target, 'server': 'BEA3AA1908656AABCCFF76582C4C6660', 'token': 
data['token'], 'bird_src': 'f8d91d57af224da7893dd397d52d811a', 'sign': sign, 'bird_t': timestamp, } validation_result = self._download_json( 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None, note='Validate credentials', errnote='Unable to validate credentials') MSG_MAP = { 'P00107': 'please login via the web interface and enter the CAPTCHA code', 'P00117': 'bad username or password', } code = validation_result['code'] if code != 'A00000': msg = MSG_MAP.get(code) if not msg: msg = 'error %s' % code if validation_result.get('msg'): msg += ': ' + validation_result['msg'] self._downloader.report_warning('unable to log in: ' + msg) return False return True def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning): auth_params = { # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as 'version': '2.0', 'platform': 'b6c13e26323c537d', 'aid': tvid, 'tvid': tvid, 'uid': '', 'deviceId': _uuid, 'playType': 'main', # XXX: always main? 'filename': os.path.splitext(url_basename(api_video_url))[0], } qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query) for key, val in qd_items.items(): auth_params[key] = val[0] auth_req = sanitized_Request( 'http://api.vip.iqiyi.com/services/ckn.action', urlencode_postdata(auth_params)) # iQiyi server throws HTTP 405 error without the following header auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded') auth_result = self._download_json( auth_req, video_id, note='Downloading video authentication JSON', errnote='Unable to download video authentication JSON') if auth_result['code'] == 'Q00506': # requires a VIP account if do_report_warning: self.report_warning('Needs a VIP account for full video') return False return auth_result def construct_video_urls(self, data, video_id, _uuid, tvid): def do_xor(x, y): a = y % 3 if a == 1: return x ^ 121 if a == 2: return x ^ 72 return x ^ 103 def get_encode_code(l): a = 0 b = l.split('-') c = len(b) s = '' for i in range(c - 1, -1, -1): a = do_xor(int(b[c - i - 1], 16), i) s += chr(a) return s[::-1] def get_path_key(x, format_id, segment_index): mg = ')(*&^flash@#$%a' tm = self._download_json( 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id, note='Download path key of segment %d for format %s' % (segment_index + 1, format_id) )['t'] t = str(int(math.floor(int(tm) / (600.0)))) return md5_text(t + mg + x) video_urls_dict = {} need_vip_warning_report = True for format_item in data['vp']['tkl'][0]['vs']: if 0 < int(format_item['bid']) <= 10: format_id = self.get_format(format_item['bid']) else: continue video_urls = [] video_urls_info = format_item['fs'] if not format_item['fs'][0]['l'].startswith('/'): t = get_encode_code(format_item['fs'][0]['l']) if t.endswith('mp4'): video_urls_info = format_item['flvs'] for segment_index, segment in enumerate(video_urls_info): vl = segment['l'] if not vl.startswith('/'): vl = get_encode_code(vl) is_vip_video = '/vip/' in vl filesize = segment['b'] base_url = data['vp']['du'].split('/') if not is_vip_video: key = get_path_key( vl.split('/')[-1].split('.')[0], format_id, segment_index) base_url.insert(-1, key) base_url = '/'.join(base_url) param = { 'su': _uuid, 'qyid': uuid.uuid4().hex, 'client': '', 'z': '', 'bt': '', 'ct': '', 'tn': str(int(time.time())) } api_video_url = base_url + vl if is_vip_video: api_video_url = api_video_url.replace('.f4v', '.hml') auth_result = self._authenticate_vip_video( api_video_url, 
video_id, tvid, _uuid, need_vip_warning_report) if auth_result is False: need_vip_warning_report = False break param.update({ 't': auth_result['data']['t'], # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as 'cid': 'afbe8fd3d73448c9', 'vid': video_id, 'QY00001': auth_result['data']['u'], }) api_video_url += '?' if '?' not in api_video_url else '&' api_video_url += compat_urllib_parse.urlencode(param) js = self._download_json( api_video_url, video_id, note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) video_url = js['l'] video_urls.append( (video_url, filesize)) video_urls_dict[format_id] = video_urls return video_urls_dict def get_format(self, bid): matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] return matched_format_ids[0] if len(matched_format_ids) else None def get_bid(self, format_id): matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id] return matched_bids[0] if len(matched_bids) else None def get_raw_data(self, tvid, video_id, enc_key, _uuid): tm = str(int(time.time())) tail = tm + tvid param = { 'key': 'fvip', 'src': md5_text('youtube-dl'), 'tvId': tvid, 'vid': video_id, 'vinfo': 1, 'tm': tm, 'enc': md5_text(enc_key + tail), 'qyid': _uuid, 'tn': random.random(), 'um': 0, 'authkey': md5_text(md5_text('') + tail), 'k_tag': 1, } api_url = 'http://cache.video.qiyi.com/vms' + '?' + \ compat_urllib_parse.urlencode(param) raw_data = self._download_json(api_url, video_id) return raw_data def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction # last update at 2016-01-22 for Zombie::bite enc_key = '6ab6d0280511493ba85594779759d4ed' return enc_key def _extract_playlist(self, webpage): PAGE_SIZE = 50 links = re.findall( r']+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"', webpage) if not links: return album_id = self._search_regex( r'albumId\s*:\s*(\d+),', webpage, 'album ID') album_title = self._search_regex( r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False) entries = list(map(self.url_result, links)) # Start from 2 because links in the first page are already on webpage for page_num in itertools.count(2): pagelist_page = self._download_webpage( 'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE), album_id, note='Download playlist page %d' % page_num, errnote='Failed to download playlist page %d' % page_num) pagelist = self._parse_json( remove_start(pagelist_page, 'var tvInfoJs='), album_id) vlist = pagelist['data']['vlist'] for item in vlist: entries.append(self.url_result(item['vurl'])) if len(vlist) < PAGE_SIZE: break return self.playlist_result(entries, album_id, album_title) def _real_extract(self, url): webpage = self._download_webpage( url, 'temp_id', note='download video page') # There's no simple way to determine whether an URL is a playlist or not # So detect it playlist_result = self._extract_playlist(webpage) if playlist_result: return playlist_result tvid = self._search_regex( r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') video_id = self._search_regex( r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') swf_url = self._search_regex( r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL') _uuid = uuid.uuid4().hex enc_key = self.get_enc_key(swf_url, video_id) raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) if raw_data['code'] != 'A000000': raise ExtractorError('Unable to load data. 
Error code: ' + raw_data['code']) data = raw_data['data'] title = data['vi']['vn'] # generate video_urls_dict video_urls_dict = self.construct_video_urls( data, video_id, _uuid, tvid) # construct info entries = [] for format_id in video_urls_dict: video_urls = video_urls_dict[format_id] for i, video_url_info in enumerate(video_urls): if len(entries) < i + 1: entries.append({'formats': []}) entries[i]['formats'].append( { 'url': video_url_info[0], 'filesize': video_url_info[-1], 'format_id': format_id, 'preference': int(self.get_bid(format_id)) } ) for i in range(len(entries)): self._sort_formats(entries[i]['formats']) entries[i].update( { 'id': '%s_part%d' % (video_id, i + 1), 'title': title, } ) if len(entries) > 1: info = { '_type': 'multi_video', 'id': video_id, 'title': title, 'entries': entries, } else: info = entries[0] info['id'] = video_id info['title'] = title return info youtube-dl/youtube_dl/extractor/eighttracks.py0000644000000000000000000001335412660177411020730 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import json import random from .common import InfoExtractor from ..compat import ( compat_str, ) from ..utils import ( ExtractorError, ) class EightTracksIE(InfoExtractor): IE_NAME = '8tracks' _VALID_URL = r'https?://8tracks\.com/(?P[^/]+)/(?P[^/#]+)(?:#.*)?$' _TEST = { 'name': 'EightTracks', 'url': 'http://8tracks.com/ytdl/youtube-dl-test-tracks-a', 'info_dict': { 'id': '1336550', 'display_id': 'youtube-dl-test-tracks-a', 'description': "test chars: \"'/\\ä↭", 'title': "youtube-dl test tracks \"'/\\ä↭<>", }, 'playlist': [ { 'md5': '96ce57f24389fc8734ce47f4c1abcc55', 'info_dict': { 'id': '11885610', 'ext': 'm4a', 'title': "youtue-dl project<>\"' - youtube-dl test track 1 \"'/\\\u00e4\u21ad", 'uploader_id': 'ytdl' } }, { 'md5': '4ab26f05c1f7291ea460a3920be8021f', 'info_dict': { 'id': '11885608', 'ext': 'm4a', 'title': "youtube-dl project - youtube-dl test track 2 \"'/\\\u00e4\u21ad", 'uploader_id': 'ytdl' } }, { 'md5': 'd30b5b5f74217410f4689605c35d1fd7', 'info_dict': { 'id': '11885679', 'ext': 'm4a', 'title': "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad", 'uploader_id': 'ytdl' } }, { 'md5': '4eb0a669317cd725f6bbd336a29f923a', 'info_dict': { 'id': '11885680', 'ext': 'm4a', 'title': "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad", 'uploader_id': 'ytdl' } }, { 'md5': '1893e872e263a2705558d1d319ad19e8', 'info_dict': { 'id': '11885682', 'ext': 'm4a', 'title': "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad", 'uploader_id': 'ytdl' } }, { 'md5': 'b673c46f47a216ab1741ae8836af5899', 'info_dict': { 'id': '11885683', 'ext': 'm4a', 'title': "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad", 'uploader_id': 'ytdl' } }, { 'md5': '1d74534e95df54986da7f5abf7d842b7', 'info_dict': { 'id': '11885684', 'ext': 'm4a', 'title': "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad", 'uploader_id': 'ytdl' } }, { 'md5': 'f081f47af8f6ae782ed131d38b9cd1c0', 'info_dict': { 'id': '11885685', 'ext': 'm4a', 'title': "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad", 'uploader_id': 'ytdl' } } ] } def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) data = self._parse_json( self._search_regex( r"(?s)PAGE\.mix\s*=\s*({.+?});\n", webpage, 'trax information'), playlist_id) session = str(random.randint(0, 1000000000)) mix_id = data['id'] track_count = data['tracks_count'] duration = data['duration'] avg_song_duration = float(duration) / track_count # duration is 
sometimes negative, use predefined avg duration if avg_song_duration <= 0: avg_song_duration = 300 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) next_url = first_url entries = [] for i in range(track_count): api_json = None download_tries = 0 while api_json is None: try: api_json = self._download_webpage( next_url, playlist_id, note='Downloading song information %d/%d' % (i + 1, track_count), errnote='Failed to download song information') except ExtractorError: if download_tries > 3: raise else: download_tries += 1 self._sleep(avg_song_duration, playlist_id) api_data = json.loads(api_json) track_data = api_data['set']['track'] info = { 'id': compat_str(track_data['id']), 'url': track_data['track_file_stream_url'], 'title': track_data['performer'] + ' - ' + track_data['name'], 'raw_title': track_data['name'], 'uploader_id': data['user']['login'], 'ext': 'm4a', } entries.append(info) next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % ( session, mix_id, track_data['id']) return { '_type': 'playlist', 'entries': entries, 'id': compat_str(mix_id), 'display_id': playlist_id, 'title': data.get('name'), 'description': data.get('description'), } youtube-dl/youtube_dl/extractor/screenjunkies.py0000644000000000000000000001221612662061715021265 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, ) class ScreenJunkiesIE(InfoExtractor): _VALID_URL = r'http://www.screenjunkies.com/video/(?P[^/]+?)(?:-(?P\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', 'md5': '5c2b686bec3d43de42bde9ec047536b0', 'info_dict': { 'id': '2841915', 'display_id': 'best-quentin-tarantino-movie', 'ext': 'mp4', 'title': 'Best Quentin Tarantino Movie', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 3671, 'age_limit': 13, 'tags': list, }, }, { 'url': 'http://www.screenjunkies.com/video/honest-trailers-the-dark-knight', 'info_dict': { 'id': '2348808', 'display_id': 'honest-trailers-the-dark-knight', 'ext': 'mp4', 'title': "Honest Trailers: 'The Dark Knight'", 'thumbnail': 're:^https?://.*\.jpg', 'age_limit': 10, 'tags': list, }, }, { # requires subscription but worked around 'url': 'http://www.screenjunkies.com/video/knocking-dead-ep-1-the-show-so-far-3003285', 'info_dict': { 'id': '3003285', 'display_id': 'knocking-dead-ep-1-the-show-so-far', 'ext': 'mp4', 'title': 'Knocking Dead Ep 1: State of The Dead Recap', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 3307, 'age_limit': 13, 'tags': list, }, }] _DEFAULT_BITRATES = (48, 150, 496, 864, 2240) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') if not video_id: webpage = self._download_webpage(url, display_id) video_id = self._search_regex( (r'src=["\']/embed/(\d+)', r'data-video-content-id=["\'](\d+)'), webpage, 'video id') webpage = self._download_webpage( 'http://www.screenjunkies.com/embed/%s' % video_id, display_id, 'Downloading video embed page') embed_vars = self._parse_json( self._search_regex( r'(?s)embedVars\s*=\s*({.+?})\s*', webpage, 'embed vars'), display_id) title = embed_vars['contentName'] formats = [] bitrates = [] for f in embed_vars.get('media', []): if not f.get('uri') or f.get('mediaPurpose') != 'play': continue bitrate = int_or_none(f.get('bitRate')) if bitrate: bitrates.append(bitrate) 
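            # Each surviving media entry becomes a plain progressive-HTTP
            # format below; the bitrates collected above double as fallback
            # values for rebuilding the HDS/HLS manifest URLs when the media
            # list is empty (subscription-only content), see further down.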
formats.append({ 'url': f['uri'], 'format_id': 'http-%d' % bitrate if bitrate else 'http', 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'tbr': bitrate, 'format': 'mp4', }) if not bitrates: # When subscriptionLevel > 0, i.e. plus subscription is required # media list will be empty. However, hds and hls uris are still # available. We can grab them assuming bitrates to be default. bitrates = self._DEFAULT_BITRATES auth_token = embed_vars.get('AuthToken') def construct_manifest_url(base_url, ext): pieces = [base_url] pieces.extend([compat_str(b) for b in bitrates]) pieces.append('_kbps.mp4.%s?%s' % (ext, auth_token)) return ','.join(pieces) if bitrates and auth_token: hds_url = embed_vars.get('hdsUri') if hds_url: f4m_formats = self._extract_f4m_formats( construct_manifest_url(hds_url, 'f4m'), display_id, f4m_id='hds', fatal=False) if len(f4m_formats) == len(bitrates): for f, bitrate in zip(f4m_formats, bitrates): if not f.get('tbr'): f['format_id'] = 'hds-%d' % bitrate f['tbr'] = bitrate # TODO: fix f4m downloader to handle manifests without bitrates if possible # formats.extend(f4m_formats) hls_url = embed_vars.get('hlsUri') if hls_url: formats.extend(self._extract_m3u8_formats( construct_manifest_url(hls_url, 'm3u8'), display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': title, 'thumbnail': embed_vars.get('thumbUri'), 'duration': int_or_none(embed_vars.get('videoLengthInSeconds')) or None, 'age_limit': parse_age_limit(embed_vars.get('audienceRating')), 'tags': embed_vars.get('tags', '').split(','), 'formats': formats, } youtube-dl/youtube_dl/extractor/gameone.py0000644000000000000000000001064712641030331020021 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( xpath_with_ns, parse_iso8601, float_or_none, int_or_none, ) NAMESPACE_MAP = { 'media': 'http://search.yahoo.com/mrss/', } # URL prefix to download the mp4 files directly instead of streaming via rtmp # Credits go to XBox-Maniac # http://board.jdownloader.org/showpost.php?p=185835&postcount=31 RAW_MP4_URL = 'http://cdn.riptide-mtvn.com/' class GameOneIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gameone\.de/tv/(?P\d+)' _TESTS = [ { 'url': 'http://www.gameone.de/tv/288', 'md5': '136656b7fb4c9cb4a8e2d500651c499b', 'info_dict': { 'id': '288', 'ext': 'mp4', 'title': 'Game One - Folge 288', 'duration': 1238, 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg', 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1', 'age_limit': 16, 'upload_date': '20140513', 'timestamp': 1399980122, } }, { 'url': 'http://gameone.de/tv/220', 'md5': '5227ca74c4ae6b5f74c0510a7c48839e', 'info_dict': { 'id': '220', 'ext': 'mp4', 'upload_date': '20120918', 'description': 'Jet Set Radio HD, Tekken Tag Tournament 2, Source Filmmaker', 'timestamp': 1347971451, 'title': 'Game One - Folge 220', 'duration': 896.62, 'age_limit': 16, } } ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) og_video = self._og_search_video_url(webpage, secure=False) description = self._html_search_meta('description', webpage) age_limit = int( self._search_regex( r'age=(\d+)', self._html_search_meta( 'age-de-meta-label', webpage), 'age_limit', '0')) mrss_url = 
self._search_regex(r'mrss=([^&]+)', og_video, 'mrss') mrss = self._download_xml(mrss_url, video_id, 'Downloading mrss') title = mrss.find('.//item/title').text thumbnail = mrss.find('.//item/image').get('url') timestamp = parse_iso8601(mrss.find('.//pubDate').text, delimiter=' ') content = mrss.find(xpath_with_ns('.//media:content', NAMESPACE_MAP)) content_url = content.get('url') content = self._download_xml( content_url, video_id, 'Downloading media:content') rendition_items = content.findall('.//rendition') duration = float_or_none(rendition_items[0].get('duration')) formats = [ { 'url': re.sub(r'.*/(r2)', RAW_MP4_URL + r'\1', r.find('./src').text), 'width': int_or_none(r.get('width')), 'height': int_or_none(r.get('height')), 'tbr': int_or_none(r.get('bitrate')), } for r in rendition_items ] self._sort_formats(formats) return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, 'description': description, 'age_limit': age_limit, 'timestamp': timestamp, } class GameOnePlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gameone\.de(?:/tv)?/?$' IE_NAME = 'gameone:playlist' _TEST = { 'url': 'http://www.gameone.de/tv', 'info_dict': { 'title': 'GameOne', }, 'playlist_mincount': 294, } def _real_extract(self, url): webpage = self._download_webpage('http://www.gameone.de/tv', 'TV') max_id = max(map(int, re.findall(r'[A-Za-z0-9-]+)' EMBED_PATTERN = r'src=(["\'])(?P(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1' _TESTS = [{ 'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) playerdata = self._download_webpage( 'http://player.screenwavemedia.com/player.php?id=%s' % video_id, video_id, 'Downloading player webpage') vidtitle = self._search_regex( r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/') playerconfig = self._download_webpage( 'http://player.screenwavemedia.com/player.js', video_id, 'Downloading playerconfig webpage') videoserver = self._search_regex(r'SWMServer\s*=\s*"([\d\.]+)"', playerdata, 'videoserver') sources = self._parse_json( js_to_json( re.sub( r'(?s)/\*.*?\*/', '', self._search_regex( r'sources\s*:\s*(\[[^\]]+?\])', playerconfig, 'sources', ).replace( "' + thisObj.options.videoserver + '", videoserver ).replace( "' + playerVidId + '", video_id ) ) ), video_id, fatal=False ) # Fallback to hardcoded sources if JS changes again if not sources: self.report_warning('Falling back to a hardcoded list of streams') sources = [{ 'file': 'http://%s/vod/%s_%s.mp4' % (videoserver, video_id, format_id), 'type': 'mp4', 'label': format_label, } for format_id, format_label in ( ('low', '144p Low'), ('med', '160p Med'), ('high', '360p High'), ('hd1', '720p HD1'))] sources.append({ 'file': 'http://%s/vod/smil:%s.smil/playlist.m3u8' % (videoserver, video_id), 'type': 'hls', }) formats = [] for source in sources: if source['type'] == 'hls': formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4')) else: file_ = source.get('file') if not file_: continue format_label = source.get('label') format_id = self._search_regex( r'_(.+?)\.[^.]+$', file_, 'format id', default=None) height = int_or_none(self._search_regex( r'^(\d+)[pP]', format_label, 'height', default=None)) formats.append({ 'url': source['file'], 'format_id': format_id, 'format': format_label, 'ext': source.get('type'), 'height': height, }) 
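        # By this point every entry of `sources` (parsed from player.js, or
        # the hardcoded fallback list) has been turned into a format: HLS
        # playlists were expanded via _extract_m3u8_formats(), and plain MP4
        # files had their format id and height parsed from file name and label.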
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': vidtitle,
            'formats': formats,
        }


class TeamFourIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/video/(?P<id>[a-z0-9\-]+)/?'
    _TEST = {
        'url': 'http://teamfourstar.com/video/a-moment-with-tfs-episode-4/',
        'info_dict': {
            'id': 'TeamFourStar-5292a02f20bfa',
            'ext': 'mp4',
            'upload_date': '20130401',
            'description': 'Check out this and more on our website: http://teamfourstar.com\nTFS Store: http://sharkrobot.com/team-four-star\nFollow on Twitter: http://twitter.com/teamfourstar\nLike on FB: http://facebook.com/teamfourstar',
            'title': 'A Moment With TFS Episode 4',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        playerdata_url = self._search_regex(
            r'src="(http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
            webpage, 'player data URL')

        video_title = self._html_search_regex(
            r'<div class="heroheadingtitle">(?P<title>.+?)</div>',
            webpage, 'title')
        video_date = unified_strdate(self._html_search_regex(
            r'<div class="heroheadingdate">(?P<date>.+?)</div>',
            webpage, 'date', fatal=False))
        video_description = self._html_search_regex(
            r'(?s)<div class="postcontent">(?P<description>.+?)</div>',
            webpage, 'description', fatal=False)
        video_thumbnail = self._og_search_thumbnail(webpage)

        return {
            '_type': 'url_transparent',
            'display_id': display_id,
            'title': video_title,
            'description': video_description,
            'upload_date': video_date,
            'thumbnail': video_thumbnail,
            'url': playerdata_url,
        }

youtube-dl/youtube_dl/extractor/theonion.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor


class TheOnionIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<id>[0-9]+)/?'
    _TEST = {
        'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/',
        'md5': '19eaa9a39cf9b9804d982e654dc791ee',
        'info_dict': {
            'id': '2133',
            'ext': 'mp4',
            'title': 'Man Wearing M&M Jacket Apparently Made In God\'s Image',
            'description': 'md5:cc12448686b5600baae9261d3e180910',
            'thumbnail': 're:^https?://.*\.jpg\?\d+$',
        }
    }

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        video_id = self._search_regex(
            r'"videoId":\s(\d+),', webpage, 'video ID')
        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)

        sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage)
        formats = []
        for src, type_ in sources:
            if type_ == 'video/mp4':
                formats.append({
                    'format_id': 'mp4_sd',
                    'preference': 1,
                    'url': src,
                })
            elif type_ == 'video/webm':
                formats.append({
                    'format_id': 'webm_sd',
                    'preference': 0,
                    'url': src,
                })
            elif type_ == 'application/x-mpegURL':
                formats.extend(
                    self._extract_m3u8_formats(src, display_id, preference=-1))
            else:
                self.report_warning(
                    'Encountered unexpected format: %s' % type_)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'description': description,
        }

youtube-dl/youtube_dl/extractor/screencastomatic.py
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    ExtractorError,
    js_to_json,
)


class ScreencastOMaticIE(InfoExtractor):
    _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'
    _TEST = {
        'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
        'md5': '483583cb80d92588f15ccbedd90f0c18',
        'info_dict': {
            'id': 'c2lD3BeOPl',
            'ext': 'mp4',
            'title': 'Welcome to 3-4 Philosophy @ DECV!',
            'thumbnail': 're:^https?://.*\.jpg$',
            'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.',
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        setup_js = self._search_regex(
            r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);",
            webpage, 'setup code')
        data = self._parse_json(setup_js, video_id, transform_source=js_to_json)
        try:
            video_data = next(
                m for m in data['modes'] if m.get('type') == 'html5')
        except StopIteration:
            raise ExtractorError('Could not find any video entries!')
        video_url = compat_urlparse.urljoin(url, video_data['config']['file'])
        thumbnail = data.get('image')

        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'url': video_url,
            'ext': 'mp4',
            'thumbnail': thumbnail,
        }

youtube-dl/youtube_dl/extractor/sbs.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from .common import InfoExtractor


class SBSIE(InfoExtractor):
    IE_DESC = 'sbs.com.au'
    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)'

    _TESTS = [{
        # Original URL is handled by the generic IE which finds the iframe:
        # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation
        'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed',
        'md5': '3150cf278965eeabb5b4cea1c963fe0a',
        'info_dict': {
            'id': '320403011771',
            'ext': 'mp4',
            'title': 'Dingo Conservation (The Feed)',
            'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
            'thumbnail': 're:http://.*\.jpg',
            'duration': 308,
        },
    }, {
        'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
        'only_matching': True,
    }, {
        'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(
'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id) player_params = self._parse_json( self._search_regex( r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'), video_id) urls = player_params['releaseUrls'] theplatform_url = (urls.get('progressive') or urls.get('standard') or urls.get('html') or player_params['relatedItemsURL']) return { '_type': 'url_transparent', 'id': video_id, 'url': theplatform_url, } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/expotv.py�����������������������������������������������������������0000644�0000000�0000000�00000005621�12641030331�017727� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, ) class ExpoTVIE(InfoExtractor): _VALID_URL = r'https?://www\.expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])' _TEST = { 'url': 'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561', 'md5': '2985e6d7a392b2f7a05e0ca350fe41d0', 'info_dict': { 'id': '17561', 'ext': 'mp4', 'upload_date': '20060212', 'title': 'My Favorite Online Scrapbook Store', 'view_count': int, 'description': 'You\'ll find most everything you need at this virtual store front.', 'uploader': 'Anna T.', 'thumbnail': 're:^https?://.*\.jpg$', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) player_key = self._search_regex( r'<param name="playerKey" value="([^"]+)"', webpage, 'player key') config = self._download_json( 'http://client.expotv.com/video/config/%s/%s' % (video_id, player_key), video_id, 'Downloading video configuration') formats = [] for fcfg in config['sources']: media_url = fcfg.get('file') if not media_url: continue if fcfg.get('type') == 'm3u8': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')) else: formats.append({ 'url': media_url, 'height': int_or_none(fcfg.get('height')), 'format_id': fcfg.get('label'), 'ext': self._search_regex( r'filename=.*\.([a-z0-9_A-Z]+)&', media_url, 'file extension', default=None) or fcfg.get('type'), }) self._sort_formats(formats) title = self._og_search_title(webpage) description = self._og_search_description(webpage) thumbnail = config.get('image') view_count = int_or_none(self._search_regex( r'<h5>Plays: ([0-9]+)</h5>', webpage, 'view counts')) uploader = self._search_regex( r'<div class="reviewer">\s*<img alt="([^"]+)"', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._search_regex( r'<h5>Reviewed on ([0-9/.]+)</h5>', webpage, 'upload date', fatal=False)) return { 'id': video_id, 'formats': formats, 'title': title, 'description': description, 'view_count': view_count, 'thumbnail': thumbnail, 'uploader': uploader, 'upload_date': upload_date, } 
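A minimal usage sketch (mine, not part of the source tree): extractors such as the ExpoTVIE above are normally driven through the public YoutubeDL API rather than instantiated directly. The URL simply reuses the _TEST fixture above, and 'skip_download' keeps the run to metadata extraction only.

import youtube_dl

# YoutubeDL routes the URL to the matching InfoExtractor via its _VALID_URL
# and returns the extracted info dict without fetching any media.
with youtube_dl.YoutubeDL({'quiet': True, 'skip_download': True}) as ydl:
    info = ydl.extract_info(
        'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561',
        download=False)
    print(info['title'])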
���������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/muzu.py�������������������������������������������������������������0000644�0000000�0000000�00000004260�12641030331�017400� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_urllib_parse, ) class MuzuTVIE(InfoExtractor): _VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)' IE_NAME = 'muzu.tv' _TEST = { 'url': 'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/', 'md5': '98f8b2c7bc50578d6a0364fff2bfb000', 'info_dict': { 'id': '1981454', 'ext': 'mp4', 'title': 'Cat Walk (Original Mix)', 'description': 'md5:90e868994de201b2570e4e5854e19420', 'uploader': 'MarcAshken featuring SOS', }, } def _real_extract(self, url): video_id = self._match_id(url) info_data = compat_urllib_parse.urlencode({ 'format': 'json', 'url': url, }) info = self._download_json( 'http://www.muzu.tv/api/oembed/?%s' % info_data, video_id, 'Downloading video info') player_info = self._download_json( 'http://player.muzu.tv/player/playerInit?ai=%s' % video_id, video_id, 'Downloading player info') video_info = player_info['videos'][0] for quality in ['1080', '720', '480', '360']: if video_info.get('v%s' % quality): break data = compat_urllib_parse.urlencode({ 'ai': video_id, # Even if each time you watch a video the hash changes, # it seems to work for different videos, and it will work # even if you use any non empty string as a hash 'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k', 'device': 'web', 'qv': quality, }) video_url_info = self._download_json( 'http://player.muzu.tv/player/requestVideo?%s' % data, video_id, 'Downloading video url') video_url = video_url_info['url'] return { 'id': video_id, 'title': info['title'], 'url': video_url, 'thumbnail': info['thumbnail_url'], 'description': info['description'], 'uploader': info['author_name'], } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/r7.py���������������������������������������������������������������0000644�0000000�0000000�00000007132�12641030331�016731� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( js_to_json, unescapeHTML, int_or_none, ) class R7IE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| noticias\.r7\.com(?:/[^/]+)+/[^/]+-| player\.r7\.com/video/i/ ) (?P<id>[\da-f]{24}) ''' _TESTS = [{ 'url': 
'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', 'md5': '403c4e393617e8e8ddc748978ee8efde', 'info_dict': { 'id': '54e7050b0cf2ff57e0279389', 'ext': 'mp4', 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 98, 'like_count': int, 'view_count': int, }, }, { 'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html', 'only_matching': True, }, { 'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/', 'only_matching': True, }, { 'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://player.r7.com/video/i/%s' % video_id, video_id) item = self._parse_json(js_to_json(self._search_regex( r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id) title = unescapeHTML(item['title']) thumbnail = item.get('init', {}).get('thumbUri') duration = None statistics = item.get('statistics', {}) like_count = int_or_none(statistics.get('likes')) view_count = int_or_none(statistics.get('views')) formats = [] for format_key, format_dict in item['playlist'][0].items(): src = format_dict.get('src') if not src: continue format_id = format_dict.get('format') or format_key if duration is None: duration = format_dict.get('duration') if '.f4m' in src: formats.extend(self._extract_f4m_formats(src, video_id, preference=-1)) elif src.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2)) else: formats.append({ 'url': src, 'format_id': format_id, }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'like_count': like_count, 'view_count': view_count, 'formats': formats, } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/esri.py�������������������������������������������������������������0000644�0000000�0000000�00000005103�12641030331�017337� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urlparse from 
..utils import ( int_or_none, parse_filesize, unified_strdate, ) class EsriVideoIE(InfoExtractor): _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)' _TEST = { 'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications', 'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc', 'info_dict': { 'id': '1124', 'ext': 'mp4', 'title': 'ArcGIS Online - Developing Applications', 'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 185, 'upload_date': '20120419', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) formats = [] for width, height, content in re.findall( r'(?s)<li><strong>(\d+)x(\d+):</strong>(.+?)</li>', webpage): for video_url, ext, filesize in re.findall( r'<a[^>]+href="([^"]+)">([^<]+) \(([^<]+)\)</a>', content): formats.append({ 'url': compat_urlparse.urljoin(url, video_url), 'ext': ext.lower(), 'format_id': '%s-%s' % (ext.lower(), height), 'width': int(width), 'height': int(height), 'filesize_approx': parse_filesize(filesize), }) self._sort_formats(formats) title = self._html_search_meta('title', webpage, 'title') description = self._html_search_meta( 'description', webpage, 'description', fatal=False) thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False) if thumbnail: thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail) duration = int_or_none(self._search_regex( [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"], webpage, 'duration', fatal=False)) upload_date = unified_strdate(self._html_search_meta( 'last-modified', webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'upload_date': upload_date, 'formats': formats } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/moviezine.py��������������������������������������������������������0000644�0000000�0000000�00000002572�12641030331�020411� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# -*- coding: utf-8 -*- from __future__ import unicode_literals import re from .common import InfoExtractor class MoviezineIE(InfoExtractor): _VALID_URL = r'https?://www\.moviezine\.se/video/(?P<id>[^?#]+)' _TEST = { 'url': 'http://www.moviezine.se/video/205866', 'info_dict': { 'id': '205866', 'ext': 'mp4', 'title': 'Oculus - Trailer 1', 'description': 'md5:40cc6790fc81d931850ca9249b40e8a4', 'thumbnail': 're:http://.*\.jpg', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) jsplayer = self._download_webpage('http://www.moviezine.se/api/player.js?video=%s' % video_id, video_id, 'Downloading js api player') formats = 
[{
            'format_id': 'sd',
            'url': self._html_search_regex(r'file: "(.+?)",', jsplayer, 'file'),
            'quality': 0,
            'ext': 'mp4',
        }]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': self._search_regex(r'title: "(.+?)",', jsplayer, 'title'),
            'thumbnail': self._search_regex(r'image: "(.+?)",', jsplayer, 'image'),
            'formats': formats,
            'description': self._og_search_description(webpage),
        }

youtube-dl/youtube_dl/extractor/behindkink.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import url_basename


class BehindKinkIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)'
    _TEST = {
        'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/',
        'md5': '507b57d8fdcd75a41a9a7bdb7989c762',
        'info_dict': {
            'id': '37127',
            'ext': 'mp4',
            'title': 'What are you passionate about – Marley Blaze',
            'description': 'md5:aee8e9611b4ff70186f752975d9b94b4',
            'upload_date': '20141205',
            'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg',
            'age_limit': 18,
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('id')

        webpage = self._download_webpage(url, display_id)

        video_url = self._search_regex(
            r'<source src="([^"]+)"', webpage, 'video URL')
        video_id = url_basename(video_url).split('_')[0]
        upload_date = mobj.group('year') + mobj.group('month') + mobj.group('day')

        return {
            'id': video_id,
            'display_id': display_id,
            'url': video_url,
            'title': self._og_search_title(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'description': self._og_search_description(webpage),
            'upload_date': upload_date,
            'age_limit': 18,
        }

youtube-dl/youtube_dl/extractor/ku6.py

from __future__ import unicode_literals

from .common import InfoExtractor


class Ku6IE(InfoExtractor):
    _VALID_URL = r'http://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html'
    _TEST = {
        'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html',
        'md5':
'01203549b9efbb45f4b87d55bdea1ed1', 'info_dict': { 'id': 'JG-8yS14xzBr4bCn1pu0xw', 'ext': 'f4v', 'title': 'techniques test', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_regex( r'<h1 title=.*>(.*?)</h1>', webpage, 'title') dataUrl = 'http://v.ku6.com/fetchVideo4Player/%s.html' % video_id jsonData = self._download_json(dataUrl, video_id) downloadUrl = jsonData['data']['f'] return { 'id': video_id, 'title': title, 'url': downloadUrl } ����������������������������������youtube-dl/youtube_dl/extractor/xvideos.py����������������������������������������������������������0000644�0000000�0000000�00000004774�12641030331�020073� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( clean_html, ExtractorError, determine_ext, sanitized_Request, ) class XVideosIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P<id>[0-9]+)(?:.*)' _TEST = { 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', 'md5': '4b46ae6ea5e6e9086e714d883313c0c9', 'info_dict': { 'id': '4588838', 'ext': 'flv', 'title': 'Biker Takes his Girl', 'age_limit': 18, } } _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19' def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) video_url = compat_urllib_parse_unquote( self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL')) video_title = self._html_search_regex( r'<title>(.*?)\s+-\s+XVID', webpage, 'title') video_thumbnail = self._search_regex( r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) formats = [{ 'url': video_url, }] android_req = sanitized_Request(url) android_req.add_header('User-Agent', self._ANDROID_USER_AGENT) android_webpage = self._download_webpage(android_req, video_id, fatal=False) if android_webpage is not None: player_params_str = self._search_regex( 'mobileReplacePlayerDivTwoQual\(([^)]+)\)', android_webpage, 'player parameters', default='') player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(','))) if player_params: formats.extend([{ 'url': param, 'preference': -10, } for param in player_params if determine_ext(param) == 'mp4']) self._sort_formats(formats) return { 'id': video_id, 'formats': formats, 'title': video_title, 'ext': 'flv', 'thumbnail': video_thumbnail, 'age_limit': 18, } ����youtube-dl/youtube_dl/extractor/cnn.py��������������������������������������������������������������0000644�0000000�0000000�00000015032�12660177411�017171� 0����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( int_or_none, parse_duration, url_basename, ) class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', 'md5': '3e6121ea48df7e2259fe73a0628605c4', 'info_dict': { 'id': 'sports/2013/06/09/nadal-1-on-1.cnn', 'ext': 'mp4', 'title': 'Nadal wins 8th French Open title', 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', 'duration': 135, 'upload_date': '20130609', }, }, { 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', 'info_dict': { 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', 'ext': 'mp4', 'title': "Student's epic speech stuns new freshmen", 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", 'upload_date': '20130821', } }, { 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', 'md5': 'f14d02ebd264df951feb2400e2c25a1b', 'info_dict': { 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', 'ext': 'mp4', 'title': 'Nashville Ep. 1: Hand crafted skateboards', 'description': 'md5:e7223a503315c9f150acac52e76de086', 'upload_date': '20141222', } }, { 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', 'only_matching': True, }, { 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) path = mobj.group('path') page_title = mobj.group('title') info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path info = self._download_xml(info_url, page_title) formats = [] rex = re.compile(r'''(?x) (?P<width>[0-9]+)x(?P<height>[0-9]+) (?:_(?P<bitrate>[0-9]+)k)? 
''') for f in info.findall('files/file'): video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip()) fdct = { 'format_id': f.attrib['bitrate'], 'url': video_url, } mf = rex.match(f.attrib['bitrate']) if mf: fdct['width'] = int(mf.group('width')) fdct['height'] = int(mf.group('height')) fdct['tbr'] = int_or_none(mf.group('bitrate')) else: mf = rex.search(f.text) if mf: fdct['width'] = int(mf.group('width')) fdct['height'] = int(mf.group('height')) fdct['tbr'] = int_or_none(mf.group('bitrate')) else: mi = re.match(r'ios_(audio|[0-9]+)$', f.attrib['bitrate']) if mi: if mi.group(1) == 'audio': fdct['vcodec'] = 'none' fdct['ext'] = 'm4a' else: fdct['tbr'] = int(mi.group(1)) formats.append(fdct) self._sort_formats(formats) thumbnails = [{ 'height': int(t.attrib['height']), 'width': int(t.attrib['width']), 'url': t.text, } for t in info.findall('images/image')] metas_el = info.find('metas') upload_date = ( metas_el.attrib.get('version') if metas_el is not None else None) duration_el = info.find('length') duration = parse_duration(duration_el.text) return { 'id': info.attrib['id'], 'title': info.find('headline').text, 'formats': formats, 'thumbnails': thumbnails, 'description': info.find('description').text, 'duration': duration, 'upload_date': upload_date, } class CNNBlogsIE(InfoExtractor): _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+' _TEST = { 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', 'md5': '3e56f97b0b6ffb4b79f4ea0749551084', 'info_dict': { 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', 'ext': 'mp4', 'title': 'Criminalizing journalism?', 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', 'upload_date': '20140209', }, 'add_ie': ['CNN'], } def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') return { '_type': 'url', 'url': cnn_url, 'ie_key': CNNIE.ie_key(), } class CNNArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)' _TEST = { 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', 'info_dict': { 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', 'ext': 'mp4', 'title': 'Obama: Cyberattack not an act of war', 'description': 'md5:51ce6750450603795cad0cdfbd7d05c5', 'upload_date': '20141221', }, 'add_ie': ['CNN'], } def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') return { '_type': 'url', 'url': 'http://cnn.com/video/?/video/' + cnn_url, 'ie_key': CNNIE.ie_key(), } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/gamekings.py��������������������������������������������������������0000644�0000000�0000000�00000005160�12653633132�020360� 0����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( xpath_text, xpath_with_ns, ) from .youtube import YoutubeIE class GamekingsIE(InfoExtractor): _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' _TESTS = [{ # YouTube embed video 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', 'md5': '5208d3a17adeaef829a7861887cb9029', 'info_dict': { 'id': 'HkSQKetlGOU', 'ext': 'mp4', 'title': 'Phoenix Wright: Ace Attorney - Dual Destinies Review', 'description': 'md5:db88c0e7f47e9ea50df3271b9dc72e1d', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader_id': 'UCJugRGo4STYMeFr5RoOShtQ', 'uploader': 'Gamekings Vault', 'upload_date': '20151123', }, 'add_ie': ['Youtube'], }, { # vimeo video 'url': 'http://www.gamekings.nl/videos/the-legend-of-zelda-majoras-mask/', 'md5': '12bf04dfd238e70058046937657ea68d', 'info_dict': { 'id': 'the-legend-of-zelda-majoras-mask', 'ext': 'mp4', 'title': 'The Legend of Zelda: Majora’s Mask', 'description': 'md5:9917825fe0e9f4057601fe1e38860de3', 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://www.gamekings.nl/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) playlist_id = self._search_regex( r'gogoVideo\([^,]+,\s*"([^"]+)', webpage, 'playlist id') # Check if a YouTube embed is used if YoutubeIE.suitable(playlist_id): return self.url_result(playlist_id, ie='Youtube') playlist = self._download_xml( 'http://www.gamekings.tv/wp-content/themes/gk2010/rss_playlist.php?id=%s' % playlist_id, video_id) NS_MAP = { 'jwplayer': 'http://rss.jwpcdn.com/' } item = playlist.find('./channel/item') thumbnail = xpath_text(item, xpath_with_ns('./jwplayer:image', NS_MAP), 'thumbnail') video_url = item.find(xpath_with_ns('./jwplayer:source', NS_MAP)).get('file') return { 'id': video_id, 'url': video_url, 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'thumbnail': thumbnail, } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/addanime.py���������������������������������������������������������0000644�0000000�0000000�00000006305�12641030331�020144� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, compat_urllib_parse, compat_urllib_parse_urlparse, ) from ..utils import ( ExtractorError, qualities, ) class 
AddAnimeIE(InfoExtractor): _VALID_URL = r'http://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P<id>[\w_]+)' _TESTS = [{ 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', 'md5': '72954ea10bc979ab5e2eb288b21425a0', 'info_dict': { 'id': '24MR3YO5SAS9', 'ext': 'mp4', 'description': 'One Piece 606', 'title': 'One Piece 606', } }, { 'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) try: webpage = self._download_webpage(url, video_id) except ExtractorError as ee: if not isinstance(ee.cause, compat_HTTPError) or \ ee.cause.code != 503: raise redir_webpage = ee.cause.read().decode('utf-8') action = self._search_regex( r'<form id="challenge-form" action="([^"]+)"', redir_webpage, 'Redirect form') vc = self._search_regex( r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>', redir_webpage, 'redirect vc value') av = re.search( r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', redir_webpage) if av is None: raise ExtractorError('Cannot find redirect math task') av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) parsed_url = compat_urllib_parse_urlparse(url) av_val = av_res + len(parsed_url.netloc) confirm_url = ( parsed_url.scheme + '://' + parsed_url.netloc + action + '?' + compat_urllib_parse.urlencode({ 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) self._download_webpage( confirm_url, video_id, note='Confirming after redirect') webpage = self._download_webpage(url, video_id) FORMATS = ('normal', 'hq') quality = qualities(FORMATS) formats = [] for format_id in FORMATS: rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id) video_url = self._search_regex(rex, webpage, 'video file URLx', fatal=False) if not video_url: continue formats.append({ 'format_id': format_id, 'url': video_url, 'quality': quality(format_id), }) self._sort_formats(formats) video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) return { '_type': 'video', 'id': video_id, 'formats': formats, 'title': video_title, 'description': video_description } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������youtube-dl/youtube_dl/extractor/dhm.py��������������������������������������������������������������0000644�0000000�0000000�00000004051�12641030331�017146� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������from __future__ import unicode_literals from .common import InfoExtractor from ..utils import parse_duration class DHMIE(InfoExtractor): IE_DESC = 'Filmarchiv - Deutsches Historisches Museum' _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/', 'md5': '11c475f670209bf6acca0b2b7ef51827', 'info_dict': { 'id': 'the-marshallplan-at-work-in-west-germany', 'ext': 'flv', 'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE', 'description': 
'md5:1fabd480c153f97b07add61c44407c82', 'duration': 660, 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://www.dhm.de/filmarchiv/02-mapping-the-wall/peter-g/rolle-1/', 'md5': '09890226332476a3e3f6f2cb74734aa5', 'info_dict': { 'id': 'rolle-1', 'ext': 'flv', 'title': 'ROLLE 1', 'thumbnail': 're:^https?://.*\.jpg$', }, }] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) playlist_url = self._search_regex( r"file\s*:\s*'([^']+)'", webpage, 'playlist url') entries = self._extract_xspf_playlist(playlist_url, playlist_id) title = self._search_regex( [r'dc:title="([^"]+)"', r'<title> »([^<]+)'], webpage, 'title').strip() description = self._html_search_regex( r'
<p><strong>Description:</strong>(.+?)</p>
    ', webpage, 'description', default=None) duration = parse_duration(self._search_regex( r'Length\s*\s*:\s*([^<]+)', webpage, 'duration', default=None)) entries[0].update({ 'title': title, 'description': description, 'duration': duration, }) return self.playlist_result(entries, playlist_id) youtube-dl/youtube_dl/extractor/amp.py0000644000000000000000000000622612644050477017201 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, ) class AMPIE(InfoExtractor): # parse Akamai Adaptive Media Player feed def _extract_feed_info(self, url): item = self._download_json( url, None, 'Downloading Akamai AMP feed', 'Unable to download Akamai AMP feed')['channel']['item'] video_id = item['guid'] def get_media_node(name, default=None): media_name = 'media-%s' % name media_group = item.get('media-group') or item return media_group.get(media_name) or item.get(media_name) or item.get(name, default) thumbnails = [] media_thumbnail = get_media_node('thumbnail') if media_thumbnail: if isinstance(media_thumbnail, dict): media_thumbnail = [media_thumbnail] for thumbnail_data in media_thumbnail: thumbnail = thumbnail_data['@attributes'] thumbnails.append({ 'url': self._proto_relative_url(thumbnail['url'], 'http:'), 'width': int_or_none(thumbnail.get('width')), 'height': int_or_none(thumbnail.get('height')), }) subtitles = {} media_subtitle = get_media_node('subTitle') if media_subtitle: if isinstance(media_subtitle, dict): media_subtitle = [media_subtitle] for subtitle_data in media_subtitle: subtitle = subtitle_data['@attributes'] lang = subtitle.get('lang') or 'en' subtitles[lang] = [{'url': subtitle['href']}] formats = [] media_content = get_media_node('content') if isinstance(media_content, dict): media_content = [media_content] for media_data in media_content: media = media_data['@attributes'] media_type = media['type'] if media_type == 'video/f4m': formats.extend(self._extract_f4m_formats( media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) elif media_type == 'application/x-mpegURL': formats.extend(self._extract_m3u8_formats( media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.append({ 'format_id': media_data['media-category']['@attributes']['label'], 'url': media['url'], 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), }) self._sort_formats(formats) return { 'id': video_id, 'title': get_media_node('title'), 'description': get_media_node('description'), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(item.get('pubDate'), ' '), 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), 'subtitles': subtitles, 'formats': formats, } youtube-dl/youtube_dl/extractor/ehow.py0000644000000000000000000000275612641030331017352 0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote class EHowIE(InfoExtractor): IE_NAME = 'eHow' _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P[0-9]+)' _TEST = { 'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', 'md5': '9809b4e3f115ae2088440bcb4efbf371', 'info_dict': { 'id': '12245069', 'ext': 'flv', 'title': 'Hardwood Flooring Basics', 'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. 
Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...', 'uploader': 'Erick Nathan', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL') final_url = compat_urllib_parse_unquote(video_url) uploader = self._html_search_meta('uploader', webpage) title = self._og_search_title(webpage).replace(' | eHow', '') return { 'id': video_id, 'url': final_url, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), 'uploader': uploader, } youtube-dl/youtube_dl/extractor/slutload.py0000644000000000000000000000255612660177411020251 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor class SlutloadIE(InfoExtractor): _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P[^/]+)/?$' _TEST = { 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', 'md5': '0cf531ae8006b530bd9df947a6a0df77', 'info_dict': { 'id': 'TD73btpBqSxc', 'ext': 'mp4', 'title': 'virginie baisee en cam', 'age_limit': 18, 'thumbnail': 're:https?://.*?\.jpg' } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) video_title = self._html_search_regex(r'
<h1><span class="hd_header">
    ([^<]+)', webpage, 'title').strip() video_url = self._html_search_regex( r'(?s)
    [^/]+)/?' _TEST = { 'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/', 'md5': '2a933874cb7dce4366075281eb49e855', 'info_dict': { 'id': '76337', 'ext': 'mp4', 'title': '۵ واکنش برتر دروازه‌بانان؛هفته ۲۶ بوندسلیگا', 'description': 'فصل ۲۰۱۵-۲۰۱۴', 'thumbnail': 're:^https?://.*\.jpg$', } } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_url = self._search_regex( r']+src="([^"]+)"', webpage, 'video url') title = self._og_search_title(webpage) description = self._html_search_regex( r'(?s)
<div class="matn">(.+?)</div>
    ', webpage, 'description', fatal=False) thumbnail = self._og_search_thumbnail(webpage) video_id = self._search_regex( r"]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", webpage, display_id, default=display_id) return { 'url': video_url, 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, } youtube-dl/youtube_dl/extractor/ina.py0000644000000000000000000000205012641030331017142 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor class InaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?PI?[A-Z0-9]+)' _TEST = { 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', 'info_dict': { 'id': 'I12055569', 'ext': 'mp4', 'title': 'François Hollande "Je crois que c\'est clair"', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id info_doc = self._download_xml(mrss_url, video_id) self.report_extraction(video_id) video_url = info_doc.find('.//{http://search.yahoo.com/mrss/}player').attrib['url'] return { 'id': video_id, 'url': video_url, 'title': info_doc.find('.//title').text, } youtube-dl/youtube_dl/extractor/chilloutzone.py0000644000000000000000000000702012641030331021114 0ustar rootrootfrom __future__ import unicode_literals import re import base64 import json from .common import InfoExtractor from ..utils import ( clean_html, ExtractorError ) class ChilloutzoneIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P[\w|-]+)\.html' _TESTS = [{ 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', 'md5': 'a76f3457e813ea0037e5244f509e66d1', 'info_dict': { 'id': 'enemene-meck-alle-katzen-weg', 'ext': 'mp4', 'title': 'Enemene Meck - Alle Katzen weg', 'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?', }, }, { 'note': 'Video hosted at YouTube', 'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html', 'info_dict': { 'id': '1YVQaAgHyRU', 'ext': 'mp4', 'title': '16 Photos Taken 1 Second Before Disaster', 'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814', 'uploader': 'BuzzFeedVideo', 'uploader_id': 'BuzzFeedVideo', 'upload_date': '20131105', }, }, { 'note': 'Video hosted at Vimeo', 'url': 'http://www.chilloutzone.net/video/icon-blending.html', 'md5': '2645c678b8dc4fefcc0e1b60db18dac1', 'info_dict': { 'id': '85523671', 'ext': 'mp4', 'title': 'The Sunday Times - Icons', 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}', 'uploader': 'Us', 'uploader_id': 'usfilms', 'upload_date': '20140131' }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict video_url = video_info_dict['mediaUrl'] description = clean_html(video_info_dict.get('description')) title = video_info_dict['title'] native_platform = video_info_dict['nativePlatform'] native_video_id = video_info_dict['nativeVideoId'] source_priority = video_info_dict['sourcePriority'] # If nativePlatform is None a fallback mechanism is used (i.e. 
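# Chilloutzone embeds its video metadata as a base64-encoded JSON blob in
# the "cozVidData" page variable; the extractor below decodes that blob and,
# depending on nativePlatform/sourcePriority, may hand the video off to its
# native YouTube or Vimeo hosting instead of using the mediaUrl directly.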
youtube embed) if native_platform is None: youtube_url = self._html_search_regex( r'[^/]+)\.html' _TEST = { 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', 'info_dict': { 'id': '3792260579001', 'ext': 'mp4', 'title': 'The Slum - Episode 1: Deliverance', 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', 'uploader': 'Al Jazeera English', }, 'add_ie': ['BrightcoveLegacy'], 'skip': 'Not accessible from Travis CI server', } def _real_extract(self, url): program_name = self._match_id(url) webpage = self._download_webpage(url, program_name) brightcove_id = self._search_regex( r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') return { '_type': 'url', 'url': ( 'brightcove:' 'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc' '&%40videoPlayer={0}'.format(brightcove_id) ), 'ie_key': 'BrightcoveLegacy', } youtube-dl/youtube_dl/extractor/azubu.py0000644000000000000000000001124212652732456017547 0ustar rootrootfrom __future__ import unicode_literals import json from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, sanitized_Request, ) class AzubuIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?azubu\.tv/[^/]+#!/play/(?P\d+)' _TESTS = [ { 'url': 'http://www.azubu.tv/GSL#!/play/15575/2014-hot6-cup-last-big-match-ro8-day-1', 'md5': 'a88b42fcf844f29ad6035054bd9ecaf4', 'info_dict': { 'id': '15575', 'ext': 'mp4', 'title': '2014 HOT6 CUP LAST BIG MATCH Ro8 Day 1', 'description': 'md5:d06bdea27b8cc4388a90ad35b5c66c01', 'thumbnail': 're:^https?://.*\.jpe?g', 'timestamp': 1417523507.334, 'upload_date': '20141202', 'duration': 9988.7, 'uploader': 'GSL', 'uploader_id': 414310, 'view_count': int, }, }, { 'url': 'http://www.azubu.tv/FnaticTV#!/play/9344/-fnatic-at-worlds-2014:-toyz---%22i-love-rekkles,-he-has-amazing-mechanics%22-', 'md5': 'b72a871fe1d9f70bd7673769cdb3b925', 'info_dict': { 'id': '9344', 'ext': 'mp4', 'title': 'Fnatic at Worlds 2014: Toyz - "I love Rekkles, he has amazing mechanics"', 'description': 'md5:4a649737b5f6c8b5c5be543e88dc62af', 'thumbnail': 're:^https?://.*\.jpe?g', 'timestamp': 1410530893.320, 'upload_date': '20140912', 'duration': 172.385, 'uploader': 'FnaticTV', 'uploader_id': 272749, 'view_count': int, }, }, ] def _real_extract(self, url): video_id = self._match_id(url) data = self._download_json( 'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data'] title = data['title'].strip() description = data['description'] thumbnail = data['thumbnail'] view_count = data['view_count'] uploader = data['user']['username'] uploader_id = data['user']['id'] stream_params = json.loads(data['stream_params']) timestamp = float_or_none(stream_params['creationDate'], 1000) duration = float_or_none(stream_params['length'], 1000) renditions = stream_params.get('renditions') or [] video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength') if video: renditions.append(video) formats = [{ 'url': fmt['url'], 'width': fmt['frameWidth'], 'height': fmt['frameHeight'], 'vbr': float_or_none(fmt['encodingRate'], 1000), 'filesize': fmt['size'], 'vcodec': fmt['videoCodec'], 'container': fmt['videoContainer'], } for fmt in renditions if fmt['url']] self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, 'view_count': view_count, 'formats': 
formats, } class AzubuLiveIE(InfoExtractor): _VALID_URL = r'http://www.azubu.tv/(?P[^/]+)$' _TEST = { 'url': 'http://www.azubu.tv/MarsTVMDLen', 'only_matching': True, } def _real_extract(self, url): user = self._match_id(url) info = self._download_json( 'http://api.azubu.tv/public/modules/last-video/{0}/info'.format(user), user)['data'] if info['type'] != 'STREAM': raise ExtractorError('{0} is not streaming live'.format(user), expected=True) req = sanitized_Request( 'https://edge-elb.api.brightcove.com/playback/v1/accounts/3361910549001/videos/ref:' + info['reference_id']) req.add_header('Accept', 'application/json;pk=BCpkADawqM1gvI0oGWg8dxQHlgT8HkdE2LnAlWAZkOlznO39bSZX726u4JqnDsK3MDXcO01JxXK2tZtJbgQChxgaFzEVdHRjaDoxaOu8hHOO8NYhwdxw9BzvgkvLUlpbDNUuDoc4E4wxDToV') bc_info = self._download_json(req, user) m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS') formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4') return { 'id': info['id'], 'title': self._live_title(info['title']), 'uploader_id': user, 'formats': formats, 'is_live': True, 'thumbnail': bc_info['poster'], } youtube-dl/youtube_dl/extractor/niconico.py0000644000000000000000000002515012641030331020202 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re import json import datetime from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urlparse, ) from ..utils import ( encode_dict, ExtractorError, int_or_none, parse_duration, parse_iso8601, sanitized_Request, xpath_text, determine_ext, ) class NiconicoIE(InfoExtractor): IE_NAME = 'niconico' IE_DESC = 'ニコニコ動画' _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', 'md5': 'd1a75c0823e2f629128c43e1212760f9', 'info_dict': { 'id': 'sm22312215', 'ext': 'mp4', 'title': 'Big Buck Bunny', 'uploader': 'takuya0301', 'uploader_id': '2698420', 'upload_date': '20131123', 'timestamp': 1385182762, 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', 'duration': 33, }, }, { # File downloaded with and without credentials are different, so omit # the md5 field 'url': 'http://www.nicovideo.jp/watch/nm14296458', 'info_dict': { 'id': 'nm14296458', 'ext': 'swf', 'title': '【鏡音リン】Dance on media【オリジナル】take2!', 'description': 'md5:689f066d74610b3b22e0f1739add0f58', 'uploader': 'りょうた', 'uploader_id': '18822557', 'upload_date': '20110429', 'timestamp': 1304065916, 'duration': 209, }, }, { # 'video exists but is marked as "deleted" # md5 is unstable 'url': 'http://www.nicovideo.jp/watch/sm10000', 'info_dict': { 'id': 'sm10000', 'ext': 'unknown_video', 'description': 'deleted', 'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>', 'upload_date': '20071224', 'timestamp': 1198527840, # timestamp field has different value if logged in 'duration': 304, }, }, { 'url': 'http://www.nicovideo.jp/watch/so22543406', 'info_dict': { 'id': '1388129933', 'ext': 'mp4', 'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~', 'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1', 'timestamp': 1388851200, 'upload_date': '20140104', 'uploader': 'アニメロチャンネル', 'uploader_id': '312', } }] _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' # Determine whether the downloader used authentication to download video _AUTHENTICATED = False def _real_initialize(self): self._login() def _login(self): (username, password) = self._get_login_info() # No authentication to be performed if not username: return True # Log in login_form_strs = { 'mail': 
username, 'password': password, } login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') request = sanitized_Request( 'https://secure.nicovideo.jp/secure/login', login_data) login_results = self._download_webpage( request, None, note='Logging in', errnote='Unable to log in') if re.search(r'(?i)
<h1 class="mb8p4">Log in error</h1>
    ', login_results) is not None: self._downloader.report_warning('unable to log in: bad username or password') return False # Successful login self._AUTHENTICATED = True return True def _real_extract(self, url): video_id = self._match_id(url) # Get video webpage. We are not actually interested in it for normal # cases, but need the cookies in order to be able to download the # info webpage webpage, handle = self._download_webpage_handle( 'http://www.nicovideo.jp/watch/' + video_id, video_id) if video_id.startswith('so'): video_id = self._match_id(handle.geturl()) video_info = self._download_xml( 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, note='Downloading video info page') if self._AUTHENTICATED: # Get flv info flv_info_webpage = self._download_webpage( 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', video_id, 'Downloading flv info') else: # Get external player info ext_player_info = self._download_webpage( 'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id) thumb_play_key = self._search_regex( r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey') # Get flv info flv_info_data = compat_urllib_parse.urlencode({ 'k': thumb_play_key, 'v': video_id }) flv_info_request = sanitized_Request( 'http://ext.nicovideo.jp/thumb_watch', flv_info_data, {'Content-Type': 'application/x-www-form-urlencoded'}) flv_info_webpage = self._download_webpage( flv_info_request, video_id, note='Downloading flv info', errnote='Unable to download flv info') flv_info = compat_urlparse.parse_qs(flv_info_webpage) if 'url' not in flv_info: if 'deleted' in flv_info: raise ExtractorError('The video has been deleted.', expected=True) else: raise ExtractorError('Unable to find video URL') video_real_url = flv_info['url'][0] # Start extracting information title = xpath_text(video_info, './/title') if not title: title = self._og_search_title(webpage, default=None) if not title: title = self._html_search_regex( r']+class="videoHeaderTitle"[^>]*>([^<]+)', webpage, 'video title') watch_api_data_string = self._html_search_regex( r']+id="watchAPIDataContainer"[^>]+>([^<]+)
    ', webpage, 'watch api data', default=None) watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {} video_detail = watch_api_data.get('videoDetail', {}) extension = xpath_text(video_info, './/movie_type') if not extension: extension = determine_ext(video_real_url) thumbnail = ( xpath_text(video_info, './/thumbnail_url') or self._html_search_meta('image', webpage, 'thumbnail', default=None) or video_detail.get('thumbnail')) description = xpath_text(video_info, './/description') timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve')) if not timestamp: match = self._html_search_meta('datePublished', webpage, 'date published', default=None) if match: timestamp = parse_iso8601(match.replace('+', ':00+')) if not timestamp and video_detail.get('postedAt'): timestamp = parse_iso8601( video_detail['postedAt'].replace('/', '-'), delimiter=' ', timezone=datetime.timedelta(hours=9)) view_count = int_or_none(xpath_text(video_info, './/view_counter')) if not view_count: match = self._html_search_regex( r'>Views: ]*>([^<]+)', webpage, 'view count', default=None) if match: view_count = int_or_none(match.replace(',', '')) view_count = view_count or video_detail.get('viewCount') comment_count = int_or_none(xpath_text(video_info, './/comment_num')) if not comment_count: match = self._html_search_regex( r'>Comments: ]*>([^<]+)', webpage, 'comment count', default=None) if match: comment_count = int_or_none(match.replace(',', '')) comment_count = comment_count or video_detail.get('commentCount') duration = (parse_duration( xpath_text(video_info, './/length') or self._html_search_meta( 'video:duration', webpage, 'video duration', default=None)) or video_detail.get('length')) webpage_url = xpath_text(video_info, './/watch_url') or url if video_info.find('.//ch_id') is not None: uploader_id = video_info.find('.//ch_id').text uploader = video_info.find('.//ch_name').text elif video_info.find('.//user_id') is not None: uploader_id = video_info.find('.//user_id').text uploader = video_info.find('.//user_nickname').text else: uploader_id = uploader = None return { 'id': video_id, 'url': video_real_url, 'title': title, 'ext': extension, 'format_id': 'economy' if video_real_url.endswith('low') else 'normal', 'thumbnail': thumbnail, 'description': description, 'uploader': uploader, 'timestamp': timestamp, 'uploader_id': uploader_id, 'view_count': view_count, 'comment_count': comment_count, 'duration': duration, 'webpage_url': webpage_url, } class NiconicoPlaylistIE(InfoExtractor): _VALID_URL = r'https?://www\.nicovideo\.jp/mylist/(?P\d+)' _TEST = { 'url': 'http://www.nicovideo.jp/mylist/27411728', 'info_dict': { 'id': '27411728', 'title': 'AKB48のオールナイトニッポン', }, 'playlist_mincount': 225, } def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);', webpage, 'entries') entries = json.loads(entries_json) entries = [{ '_type': 'url', 'ie_key': NiconicoIE.ie_key(), 'url': ('http://www.nicovideo.jp/watch/%s' % entry['item_data']['video_id']), } for entry in entries] return { '_type': 'playlist', 'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'), 'id': list_id, 'entries': entries, } youtube-dl/youtube_dl/extractor/trollvids.py0000644000000000000000000000214112650650456020436 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .nuevo import NuevoBaseIE class TrollvidsIE(NuevoBaseIE): _VALID_URL = 
r'http://(?:www\.)?trollvids\.com/video/(?P\d+)/(?P[^/?#&]+)' IE_NAME = 'trollvids' _TEST = { 'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', 'md5': '1d53866b2c514b23ed69e4352fdc9839', 'info_dict': { 'id': '2349002', 'ext': 'mp4', 'title': '【MMD R-18】ガールフレンド carry_me_off', 'age_limit': 18, 'duration': 216.78, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') info = self._extract_nuevo( 'http://trollvids.com/nuevo/player/config.php?v=%s' % video_id, video_id) info.update({ 'display_id': display_id, 'age_limit': 18 }) return info youtube-dl/youtube_dl/extractor/mixcloud.py0000644000000000000000000001001512641030331020217 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, HEADRequest, str_to_int, ) class MixcloudIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)' IE_NAME = 'mixcloud' _TESTS = [{ 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', 'info_dict': { 'id': 'dholbach-cryptkeeper', 'ext': 'mp3', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', 'uploader': 'Daniel Holbach', 'uploader_id': 'dholbach', 'thumbnail': 're:https?://.*\.jpg', 'view_count': int, 'like_count': int, }, }, { 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', 'info_dict': { 'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat', 'ext': 'mp3', 'title': 'Caribou 7 inch Vinyl Mix & Chat', 'description': 'md5:2b8aec6adce69f9d41724647c65875e8', 'uploader': 'Gilles Peterson Worldwide', 'uploader_id': 'gillespeterson', 'thumbnail': 're:https?://.*/images/', 'view_count': int, 'like_count': int, }, }] def _check_url(self, url, track_id, ext): try: # We only want to know if the request succeed # don't download the whole file self._request_webpage( HEADRequest(url), track_id, 'Trying %s URL' % ext) return True except ExtractorError: return False def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group(1) cloudcast_name = mobj.group(2) track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name))) webpage = self._download_webpage(url, track_id) preview_url = self._search_regex( r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url') song_url = re.sub(r'audiocdn(\d+)', r'stream\1', preview_url) song_url = song_url.replace('/previews/', '/c/originals/') if not self._check_url(song_url, track_id, 'mp3'): song_url = song_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') if not self._check_url(song_url, track_id, 'm4a'): raise ExtractorError('Unable to extract track url') PREFIX = ( r'm-play-on-spacebar[^>]+' r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+') title = self._html_search_regex( PREFIX + r'm-title="([^"]+)"', webpage, 'title') thumbnail = self._proto_relative_url(self._html_search_regex( PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) uploader = self._html_search_regex( PREFIX + r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) uploader_id = self._search_regex( r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) description = self._og_search_description(webpage) like_count = str_to_int(self._search_regex( 
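            # Counts are scraped from rendered markup attributes, so they may
            # include thousands separators (e.g. '1,337'); str_to_int() strips
            # those before converting.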
r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"', webpage, 'like count', fatal=False)) view_count = str_to_int(self._search_regex( [r'([0-9,.]+)
    '], webpage, 'play count', fatal=False)) return { 'id': track_id, 'title': title, 'url': song_url, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, 'uploader_id': uploader_id, 'view_count': view_count, 'like_count': like_count, } youtube-dl/youtube_dl/extractor/dramafever.py0000644000000000000000000001647112644050477020543 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import itertools from .amp import AMPIE from ..compat import ( compat_HTTPError, compat_urllib_parse, compat_urlparse, ) from ..utils import ( ExtractorError, clean_html, int_or_none, sanitized_Request, ) class DramaFeverBaseIE(AMPIE): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' _CONSUMER_SECRET = 'DA59dtVXYLxajktV' _consumer_secret = None def _get_consumer_secret(self): mainjs = self._download_webpage( 'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js', None, 'Downloading main.js', fatal=False) if not mainjs: return self._CONSUMER_SECRET return self._search_regex( r"var\s+cs\s*=\s*'([^']+)'", mainjs, 'consumer secret', default=self._CONSUMER_SECRET) def _real_initialize(self): self._login() self._consumer_secret = self._get_consumer_secret() def _login(self): (username, password) = self._get_login_info() if username is None: return login_form = { 'username': username, 'password': password, } request = sanitized_Request( self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) response = self._download_webpage( request, None, 'Logging in as %s' % username) if all(logout_pattern not in response for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): error = self._html_search_regex( r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<', response, 'error message', default=None) if error: raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') class DramaFeverIE(DramaFeverBaseIE): IE_NAME = 'dramafever' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+/[0-9]+)(?:/|$)' _TESTS = [{ 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', 'info_dict': { 'id': '4512.1', 'ext': 'mp4', 'title': 'Cooking with Shin 4512.1', 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', 'episode': 'Episode 1', 'episode_number': 1, 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1404336058, 'upload_date': '20140702', 'duration': 343, }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1', 'info_dict': { 'id': '4826.4', 'ext': 'mp4', 'title': 'Mnet Asian Music Awards 2015 4826.4', 'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91', 'episode': 'Mnet Asian Music Awards 2015 - Part 3', 'episode_number': 4, 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1450213200, 'upload_date': '20151215', 'duration': 5602, }, 'params': { # m3u8 download 'skip_download': True, }, }] def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') try: info = self._extract_feed_info( 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): raise ExtractorError( 'Currently unavailable in your country.', expected=True) raise series_id, episode_number = video_id.split('.') episode_info = self._download_json( # We only need a single episode info, so restricting page size to one episode # and dealing with page number as with episode number 
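            # e.g. video_id '4512.1' is split into series_id '4512' and
            # episode_number '1', i.e. the request becomes
            # .../api/4/episode/series/?cs=<secret>&series_id=4512&page_number=1&page_size=1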
r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1' % (self._consumer_secret, series_id, episode_number), video_id, 'Downloading episode info JSON', fatal=False) if episode_info: value = episode_info.get('value') if isinstance(value, list): for v in value: if v.get('type') == 'Episode': subfile = v.get('subfile') or v.get('new_subfile') if subfile and subfile != 'http://www.dramafever.com/st/': info.setdefault('subtitles', {}).setdefault('English', []).append({ 'ext': 'srt', 'url': subfile, }) episode_number = int_or_none(v.get('number')) episode_fallback = 'Episode' if episode_number: episode_fallback += ' %d' % episode_number info['episode'] = v.get('title') or episode_fallback info['episode_number'] = episode_number break return info class DramaFeverSeriesIE(DramaFeverBaseIE): IE_NAME = 'dramafever:series' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' _TESTS = [{ 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/', 'info_dict': { 'id': '4512', 'title': 'Cooking with Shin', 'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1', }, 'playlist_count': 4, }, { 'url': 'http://www.dramafever.com/drama/124/IRIS/', 'info_dict': { 'id': '124', 'title': 'IRIS', 'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862', }, 'playlist_count': 20, }] _PAGE_SIZE = 60 # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-) def _real_extract(self, url): series_id = self._match_id(url) series = self._download_json( 'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s' % (self._consumer_secret, series_id), series_id, 'Downloading series JSON')['series'][series_id] title = clean_html(series['name']) description = clean_html(series.get('description') or series.get('description_short')) entries = [] for page_num in itertools.count(1): episodes = self._download_json( 'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d' % (self._consumer_secret, series_id, self._PAGE_SIZE, page_num), series_id, 'Downloading episodes JSON page #%d' % page_num) for episode in episodes.get('value', []): episode_url = episode.get('episode_url') if not episode_url: continue entries.append(self.url_result( compat_urlparse.urljoin(url, episode_url), 'DramaFever', episode.get('guid'))) if page_num == episodes['num_pages']: break return self.playlist_result(entries, series_id, title, description) youtube-dl/youtube_dl/extractor/crunchyroll.py0000644000000000000000000004422412660177411020764 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re import json import base64 import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_request, compat_urlparse, ) from ..utils import ( ExtractorError, bytes_to_intlist, intlist_to_bytes, int_or_none, lowercase_escape, remove_end, sanitized_Request, unified_strdate, urlencode_postdata, xpath_text, ) from ..aes import ( aes_cbc_decrypt, ) class CrunchyrollBaseIE(InfoExtractor): _NETRC_MACHINE = 'crunchyroll' def _login(self): (username, password) = self._get_login_info() if username is None: return self.report_login() login_url = 'https://www.crunchyroll.com/?a=formhandler' data = urlencode_postdata({ 'formname': 'RpcApiUser_Login', 'name': username, 'password': password, }) login_request = sanitized_Request(login_url, data) 
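        # The credentials are POSTed as application/x-www-form-urlencoded bytes
        # to the formhandler endpoint; the matching Content-Type header is set
        # on the next line.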
login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') self._download_webpage(login_request, None, False, 'Wrong login info') def _real_initialize(self): self._login() def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else sanitized_Request(url_or_request)) # Accept-Language must be set explicitly to accept any language to avoid issues # similar to https://github.com/rg3/youtube-dl/issues/6797. # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction # should be imposed or not (from what I can see it just takes the first language # ignoring the priority and requires it to correspond the IP). By the way this causes # Crunchyroll to not work in georestriction cases in some browsers that don't place # the locale lang first in header. However allowing any language seems to workaround the issue. request.add_header('Accept-Language', '*') return super(CrunchyrollBaseIE, self)._download_webpage( request, video_id, note, errnote, fatal, tries, timeout, encoding) @staticmethod def _add_skip_wall(url): parsed_url = compat_urlparse.urlparse(url) qs = compat_urlparse.parse_qs(parsed_url.query) # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message: # > This content may be inappropriate for some people. # > Are you sure you want to continue? # since it's not disabled by default in crunchyroll account's settings. # See https://github.com/rg3/youtube-dl/issues/7202. qs['skip_wall'] = ['1'] return compat_urlparse.urlunparse( parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) class CrunchyrollIE(CrunchyrollBaseIE): _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { 'id': '645513', 'ext': 'flv', 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', 'description': 'md5:2d17137920c64f2f49981a7797d275ef', 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', 'uploader': 'Yomiuri Telecasting Corporation (YTV)', 'upload_date': '20131013', 'url': 're:(?!.*&)', }, 'params': { # rtmp 'skip_download': True, }, }, { 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', 'info_dict': { 'id': '589804', 'ext': 'flv', 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Danny Choo Network', 'upload_date': '20120213', }, 'params': { # rtmp 'skip_download': True, }, }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, }, { # geo-restricted (US), 18+ maturity wall, non-premium available 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', 'only_matching': True, }] _FORMAT_IDS = { '360': ('60', '106'), '480': ('61', '106'), '720': ('62', '106'), '1080': ('80', '108'), } def _decrypt_subtitles(self, data, iv, id): data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) id = int(id) def obfuscate_key_aux(count, modulo, start): output = list(start) for _ in 
range(count): output.append(output[-1] + output[-2]) # cut off start values output = output[2:] output = list(map(lambda x: x % modulo + 33, output)) return output def obfuscate_key(key): num1 = int(floor(pow(2, 25) * sqrt(6.9))) num2 = (num1 ^ key) << 5 num3 = key ^ num1 num4 = num3 ^ (num3 >> 3) ^ num2 prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest()) # Extend 160 Bit hash to 256 Bit return shaHash + [0] * 12 key = obfuscate_key(id) decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) return zlib.decompress(decrypted_data) def _convert_subtitles_to_srt(self, sub_root): output = '' for i, event in enumerate(sub_root.findall('./events/event'), 1): start = event.attrib['start'].replace('.', ',') end = event.attrib['end'].replace('.', ',') text = event.attrib['text'].replace('\\N', '\n') output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text) return output def _convert_subtitles_to_ass(self, sub_root): output = '' def ass_bool(strvalue): assvalue = '0' if strvalue == '1': assvalue = '-1' return assvalue output = '[Script Info]\n' output += 'Title: %s\n' % sub_root.attrib['title'] output += 'ScriptType: v4.00+\n' output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] output += """ScaledBorderAndShadow: yes [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding """ for style in sub_root.findall('./styles/style'): output += 'Style: ' + style.attrib['name'] output += ',' + style.attrib['font_name'] output += ',' + style.attrib['font_size'] output += ',' + style.attrib['primary_colour'] output += ',' + style.attrib['secondary_colour'] output += ',' + style.attrib['outline_colour'] output += ',' + style.attrib['back_colour'] output += ',' + ass_bool(style.attrib['bold']) output += ',' + ass_bool(style.attrib['italic']) output += ',' + ass_bool(style.attrib['underline']) output += ',' + ass_bool(style.attrib['strikeout']) output += ',' + style.attrib['scale_x'] output += ',' + style.attrib['scale_y'] output += ',' + style.attrib['spacing'] output += ',' + style.attrib['angle'] output += ',' + style.attrib['border_style'] output += ',' + style.attrib['outline'] output += ',' + style.attrib['shadow'] output += ',' + style.attrib['alignment'] output += ',' + style.attrib['margin_l'] output += ',' + style.attrib['margin_r'] output += ',' + style.attrib['margin_v'] output += ',' + style.attrib['encoding'] output += '\n' output += """ [Events] Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text """ for event in sub_root.findall('./events/event'): output += 'Dialogue: 0' output += ',' + event.attrib['start'] output += ',' + event.attrib['end'] output += ',' + event.attrib['style'] output += ',' + event.attrib['name'] output += ',' + event.attrib['margin_l'] output += ',' + event.attrib['margin_r'] output += ',' + event.attrib['margin_v'] output += ',' + event.attrib['effect'] output += ',' + event.attrib['text'] output += '\n' return output def _extract_subtitles(self, subtitle): sub_root = compat_etree_fromstring(subtitle) return [{ 'ext': 'srt', 'data': self._convert_subtitles_to_srt(sub_root), }, { 'ext': 'ass', 'data': 
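        # The same decrypted subtitle XML tree is rendered twice: as plain
        # SRT above and, below, as ASS including the original styling blocks.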
self._convert_subtitles_to_ass(sub_root), }] def _get_subtitles(self, video_id, webpage): subtitles = {} for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage): sub_page = self._download_webpage( 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, video_id, note='Downloading subtitles for ' + sub_name) id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) iv = self._search_regex(r'([^<]+)', sub_page, 'subtitle_iv', fatal=False) data = self._search_regex(r'([^<]+)', sub_page, 'subtitle_data', fatal=False) if not id or not iv or not data: continue subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) if not lang_code: continue subtitles[lang_code] = self._extract_subtitles(subtitle) return subtitles def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') if mobj.group('prefix') == 'm': mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage') webpage_url = self._search_regex(r'', mobile_webpage, 'webpage_url') else: webpage_url = 'http://www.' + mobj.group('url') webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage') note_m = self._html_search_regex( r'
<div class="showmedia-trailer-notice">(.+?)</div>
    ', webpage, 'trailer-notice', default='') if note_m: raise ExtractorError(note_m) mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P{.+?})\]\)', webpage) if mobj: msg = json.loads(mobj.group('msg')) if msg.get('type') == 'error': raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) if 'To view this, please log in to verify you are 18 or older.' in webpage: self.raise_login_required() video_title = self._html_search_regex( r'(?s)]*>((?:(?!]+itemprop=["\']title["\'][^>]*>(?:(?!', webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex( r']*>\s*.+?\[media_id=%s\].+?"description"\s*:\s*"([^"]+)' % video_id, webpage, 'description', default=None) if video_description: video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( [r'
<div>Availability for free users:(.+?)</div>
    ', r'
<div>[^<>]+\s*<h3>\s*(.+?\d{4})\s*</h3></div>
    '], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) if video_upload_date: video_upload_date = unified_strdate(video_upload_date) video_uploader = self._html_search_regex( r']+href="/publisher/[^"]+"[^>]*>([^<]+)', webpage, 'video_uploader', fatal=False) playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = sanitized_Request(playerdata_url) playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') stream_id = self._search_regex(r'([^<]+)', playerdata, 'stream_id') video_thumbnail = self._search_regex(r'([^<]+)', playerdata, 'thumbnail', fatal=False) formats = [] for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage): stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' streamdata_req = sanitized_Request( 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' % (stream_id, stream_format, stream_quality), compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8')) streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata = self._download_xml( streamdata_req, video_id, note='Downloading media info for %s' % video_format) stream_info = streamdata.find('./{default}preload/stream_info') video_url = xpath_text(stream_info, './host') video_play_path = xpath_text(stream_info, './file') if not video_url or not video_play_path: continue metadata = stream_info.find('./metadata') format_info = { 'format': video_format, 'format_id': video_format, 'height': int_or_none(xpath_text(metadata, './height')), 'width': int_or_none(xpath_text(metadata, './width')), } if '.fplive.net/' in video_url: video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) parsed_video_url = compat_urlparse.urlparse(video_url) direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( netloc='v.lvlt.crcdn.net', path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1]))) if self._is_valid_url(direct_video_url, video_id, video_format): format_info.update({ 'url': direct_video_url, }) formats.append(format_info) continue format_info.update({ 'url': video_url, 'play_path': video_play_path, 'ext': 'flv', }) formats.append(format_info) subtitles = self.extract_subtitles(video_id, webpage) return { 'id': video_id, 'title': video_title, 'description': video_description, 'thumbnail': video_thumbnail, 'uploader': video_uploader, 'upload_date': video_upload_date, 'subtitles': subtitles, 'formats': formats, } class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = 'crunchyroll:playlist' _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P[\w\-]+))/?(?:\?|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', 'info_dict': { 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' }, 'playlist_count': 13, }, { # geo-restricted (US), 18+ maturity wall, non-premium available 'url': 'http://www.crunchyroll.com/cosplay-complex-ova', 'info_dict': { 'id': 'cosplay-complex-ova', 'title': 'Cosplay Complex OVA' }, 'playlist_count': 3, 'skip': 
'Georestricted', }, { # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', 'only_matching': True, }] def _real_extract(self, url): show_id = self._match_id(url) webpage = self._download_webpage(self._add_skip_wall(url), show_id) title = self._html_search_regex( r'(?s)]*>\s*(.*?)', webpage, 'title') episode_paths = re.findall( r'(?s)
  • ]+>.*?[0-9]+)/' _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', 'info_dict': { 'id': '297', 'ext': 'mp4', 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR', 'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.' } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_regex( r'
<h2 class="white">(.*?)</h2>
    ', webpage, 'title') video_info_dicts = re.findall( r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) quality = video_info['quality'] video_url = video_info['src'] formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, 'format_note': quality, 'format_id': '%s-%s' % (quality, determine_ext(video_url)), }) self._sort_formats(formats) return { 'id': video_id, 'formats': formats, 'title': title, 'description': self._og_search_description(webpage), } youtube-dl/youtube_dl/extractor/plays.py0000644000000000000000000000341512657443441017552 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import int_or_none class PlaysTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?plays\.tv/video/(?P[0-9a-f]{18})' _TEST = { 'url': 'http://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', 'md5': 'dfeac1198506652b5257a62762cec7bc', 'info_dict': { 'id': '56af17f56c95335490', 'ext': 'mp4', 'title': 'When you outplay the Azir wall', 'description': 'Posted by Bjergsen', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) content = self._parse_json( self._search_regex( r'R\.bindContent\(({.+?})\);', webpage, 'content'), video_id)['content'] mpd_url, sources = re.search( r'(?s)]+data-mpd="([^"]+)"[^>]*>(.+?)', content).groups() formats = self._extract_mpd_formats( self._proto_relative_url(mpd_url), video_id, mpd_id='DASH') for format_id, height, format_url in re.findall(r'\d+)' _TESTS = [{ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', 'md5': '18a525a510f942ada2720db5f31644c0', 'info_dict': { 'id': '100000002847155', 'ext': 'mov', 'title': 'Verbatim: What Is a Photocopier?', 'description': 'md5:93603dada88ddbda9395632fdc5da260', 'timestamp': 1398631707, 'upload_date': '20140427', 'uploader': 'Brett Weiner', 'duration': 419, } }, { 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) return self._extract_video_from_id(video_id) class NYTimesArticleIE(NYTimesBaseIE): _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?[^.]+)(?:\.html)?' _TESTS = [{ 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0', 'md5': 'e2076d58b4da18e6a001d53fd56db3c9', 'info_dict': { 'id': '100000003628438', 'ext': 'mov', 'title': 'New Minimum Wage: $70,000 a Year', 'description': 'Dan Price, C.E.O. 
of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.', 'timestamp': 1429033037, 'upload_date': '20150414', 'uploader': 'Matthew Williams', } }, { 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id') return self._extract_video_from_id(video_id) youtube-dl/youtube_dl/extractor/normalboots.py0000644000000000000000000000445612641030331020746 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( unified_strdate, ) class NormalbootsIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P[0-9a-z-]*)/?$' _TEST = { 'url': 'http://normalboots.com/video/home-alone-games-jontron/', 'md5': '8bf6de238915dd501105b44ef5f1e0f6', 'info_dict': { 'id': 'home-alone-games-jontron', 'ext': 'mp4', 'title': 'Home Alone Games - JonTron - NormalBoots', 'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/', 'uploader': 'JonTron', 'upload_date': '20140125', }, 'params': { # rtmp download 'skip_download': True, }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_uploader = self._html_search_regex( r'Posted\sby\s(?P[A-Za-z]*)\s
    ', webpage, 'uploader', fatal=False) video_upload_date = unified_strdate(self._html_search_regex( r'[A-Za-z]+, (?P.*)', webpage, 'date', fatal=False)) player_url = self._html_search_regex( r'[\S]+)"', webpage, 'player url') player_page = self._download_webpage(player_url, video_id) video_url = self._html_search_regex( r"file:\s'(?P[^']+\.mp4)'", player_page, 'file') return { 'id': video_id, 'url': video_url, 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'uploader': video_uploader, 'upload_date': video_upload_date, } youtube-dl/youtube_dl/extractor/mwave.py0000644000000000000000000000377612641030331017532 0ustar rootrootfrom __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, parse_duration, ) class MwaveIE(InfoExtractor): _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P[0-9]+)' _TEST = { 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', 'md5': 'c930e27b7720aaa3c9d0018dfc8ff6cc', 'info_dict': { 'id': '168859', 'ext': 'flv', 'title': '[M COUNTDOWN] SISTAR - SHAKE IT', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'M COUNTDOWN', 'duration': 206, 'view_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) vod_info = self._download_json( 'http://mwave.interest.me/onair/vod_info.m?vodtype=CL§orid=&endinfo=Y&id=%s' % video_id, video_id, 'Download vod JSON') formats = [] for num, cdn_info in enumerate(vod_info['cdn']): stream_url = cdn_info.get('url') if not stream_url: continue stream_name = cdn_info.get('name') or compat_str(num) f4m_stream = self._download_json( stream_url, video_id, 'Download %s stream JSON' % stream_name) f4m_url = f4m_stream.get('fileurl') if not f4m_url: continue formats.extend( self._extract_f4m_formats(f4m_url + '&hdcore=3.0.3', video_id, f4m_id=stream_name)) self._sort_formats(formats) return { 'id': video_id, 'title': vod_info['title'], 'thumbnail': vod_info.get('cover'), 'uploader': vod_info.get('program_title'), 'duration': parse_duration(vod_info.get('time')), 'view_count': int_or_none(vod_info.get('hit')), 'formats': formats, } youtube-dl/youtube_dl/extractor/noco.py0000644000000000000000000002042612641030331017340 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re import time import hashlib from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, compat_urlparse, ) from ..utils import ( clean_html, ExtractorError, int_or_none, float_or_none, parse_iso8601, sanitized_Request, ) class NocoIE(InfoExtractor): _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P\d+)' _LOGIN_URL = 'http://noco.tv/do.php' _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s' _SUB_LANG_TEMPLATE = '&sub_lang=%s' _NETRC_MACHINE = 'noco' _TESTS = [ { 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/', 'md5': '0a993f0058ddbcd902630b2047ef710e', 'info_dict': { 'id': '11538', 'ext': 'mp4', 'title': 'Ami Ami Idol - Hello! 
France', 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86', 'upload_date': '20140412', 'uploader': 'Nolife', 'uploader_id': 'NOL', 'duration': 2851.2, }, 'skip': 'Requires noco account', }, { 'url': 'http://noco.tv/emission/12610/lbl42/the-guild/s01e01-wake-up-call', 'md5': 'c190f1f48e313c55838f1f412225934d', 'info_dict': { 'id': '12610', 'ext': 'mp4', 'title': 'The Guild #1 - Wake-Up Call', 'timestamp': 1403863200, 'upload_date': '20140627', 'uploader': 'LBL42', 'uploader_id': 'LBL', 'duration': 233.023, }, 'skip': 'Requires noco account', } ] def _real_initialize(self): self._login() def _login(self): (username, password) = self._get_login_info() if username is None: return login_form = { 'a': 'login', 'cookie': '1', 'username': username, 'password': password, } request = sanitized_Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8') login = self._download_json(request, None, 'Logging in as %s' % username) if 'erreur' in login: raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) @staticmethod def _ts(): return int(time.time() * 1000) def _call_api(self, path, video_id, note, sub_lang=None): ts = compat_str(self._ts() + self._ts_offset) tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest() url = self._API_URL_TEMPLATE % (path, ts, tk) if sub_lang: url += self._SUB_LANG_TEMPLATE % sub_lang request = sanitized_Request(url) request.add_header('Referer', self._referer) resp = self._download_json(request, video_id, note) if isinstance(resp, dict) and resp.get('error'): self._raise_error(resp['error'], resp['description']) return resp def _raise_error(self, error, description): raise ExtractorError( '%s returned error: %s - %s' % (self.IE_NAME, error, description), expected=True) def _real_extract(self, url): video_id = self._match_id(url) # Timestamp adjustment offset between server time and local time # must be calculated in order to use timestamps closest to server's # in all API requests (see https://github.com/rg3/youtube-dl/issues/7864) webpage = self._download_webpage(url, video_id) player_url = self._search_regex( r'(["\'])(?Phttps?://noco\.tv/(?:[^/]+/)+NocoPlayer.+?\.swf.*?)\1', webpage, 'noco player', group='player', default='http://noco.tv/cdata/js/player/NocoPlayer-v1.2.40.swf') qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query) ts = int_or_none(qs.get('ts', [None])[0]) self._ts_offset = ts - self._ts() if ts else 0 self._referer = player_url medias = self._call_api( 'shows/%s/medias' % video_id, video_id, 'Downloading video JSON') show = self._call_api( 'shows/by_id/%s' % video_id, video_id, 'Downloading show JSON')[0] options = self._call_api( 'users/init', video_id, 'Downloading user options JSON')['options'] audio_lang_pref = options.get('audio_language') or options.get('language', 'fr') if audio_lang_pref == 'original': audio_lang_pref = show['original_lang'] if len(medias) == 1: audio_lang_pref = list(medias.keys())[0] elif audio_lang_pref not in medias: audio_lang_pref = 'fr' qualities = self._call_api( 'qualities', video_id, 'Downloading qualities JSON') formats = [] for audio_lang, audio_lang_dict in medias.items(): preference = 1 if audio_lang == audio_lang_pref else 0 for sub_lang, lang_dict in audio_lang_dict['video_list'].items(): for format_id, fmt in lang_dict['quality_list'].items(): format_id_extended = 'audio-%s_sub-%s_%s' % (audio_lang, 
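                    # e.g. 'audio-fr_sub-none_LQ' (illustrative value; the real
                    # ids come from the API's quality_list keys)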
sub_lang, format_id) video = self._call_api( 'shows/%s/video/%s/%s' % (video_id, format_id.lower(), audio_lang), video_id, 'Downloading %s video JSON' % format_id_extended, sub_lang if sub_lang != 'none' else None) file_url = video['file'] if not file_url: continue if file_url in ['forbidden', 'not found']: popmessage = video['popmessage'] self._raise_error(popmessage['title'], popmessage['message']) formats.append({ 'url': file_url, 'format_id': format_id_extended, 'width': int_or_none(fmt.get('res_width')), 'height': int_or_none(fmt.get('res_lines')), 'abr': int_or_none(fmt.get('audiobitrate'), 1000), 'vbr': int_or_none(fmt.get('videobitrate'), 1000), 'filesize': int_or_none(fmt.get('filesize')), 'format_note': qualities[format_id].get('quality_name'), 'quality': qualities[format_id].get('priority'), 'preference': preference, }) self._sort_formats(formats) timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ') if timestamp is not None and timestamp < 0: timestamp = None uploader = show.get('partner_name') uploader_id = show.get('partner_key') duration = float_or_none(show.get('duration_ms'), 1000) thumbnails = [] for thumbnail_key, thumbnail_url in show.items(): m = re.search(r'^screenshot_(?P\d+)x(?P\d+)$', thumbnail_key) if not m: continue thumbnails.append({ 'url': thumbnail_url, 'width': int(m.group('width')), 'height': int(m.group('height')), }) episode = show.get('show_TT') or show.get('show_OT') family = show.get('family_TT') or show.get('family_OT') episode_number = show.get('episode_number') title = '' if family: title += family if episode_number: title += ' #' + compat_str(episode_number) if episode: title += ' - ' + compat_str(episode) description = show.get('show_resume') or show.get('family_resume') return { 'id': video_id, 'title': title, 'description': description, 'thumbnails': thumbnails, 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, 'duration': duration, 'formats': formats, } youtube-dl/youtube_dl/extractor/streamcloud.py0000644000000000000000000000357512641030331020732 0ustar rootroot# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import sanitized_Request class StreamcloudIE(InfoExtractor): IE_NAME = 'streamcloud.eu' _VALID_URL = r'https?://streamcloud\.eu/(?P[a-zA-Z0-9_-]+)(?:/(?P[^#?]*)\.html)?' _TEST = { 'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html', 'md5': '6bea4c7fa5daaacc2a946b7146286686', 'info_dict': { 'id': 'skp9j99s4bpz', 'ext': 'mp4', 'title': 'youtube-dl test video \'/\\ ä ↭', }, 'skip': 'Only available from the EU' } def _real_extract(self, url): video_id = self._match_id(url) url = 'http://streamcloud.eu/%s' % video_id orig_webpage = self._download_webpage(url, video_id) fields = re.findall(r'''(?x)]*>([^<]+)<', webpage, 'title') video_url = self._search_regex( r'file:\s*"([^"]+)"', webpage, 'video URL') thumbnail = self._search_regex( r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False) return { 'id': video_id, 'title': title, 'url': video_url, 'thumbnail': thumbnail, } youtube-dl/youtube_dl/extractor/ign.py0000644000000000000000000002112512641247326017172 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, ) class IGNIE(InfoExtractor): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. 
Some videos of it.ign.com are also supported """ _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?Pvideos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P.+)' IE_NAME = 'ign.com' _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' _EMBED_RE = r']+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' _TESTS = [ { 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', 'info_dict': { 'id': '8f862beef863986b2785559b9e1aa599', 'ext': 'mp4', 'title': 'The Last of Us Review', 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', 'timestamp': 1370440800, 'upload_date': '20130605', 'uploader_id': 'cberidon@ign.com', } }, { 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { 'id': '100-little-things-in-gta-5-that-will-blow-your-mind', }, 'playlist': [ { 'info_dict': { 'id': '5ebbd138523268b93c9141af17bec937', 'ext': 'mp4', 'title': 'GTA 5 Video Review', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', 'timestamp': 1379339880, 'upload_date': '20130916', 'uploader_id': 'danieljkrupa@gmail.com', }, }, { 'info_dict': { 'id': '638672ee848ae4ff108df2a296418ee2', 'ext': 'mp4', 'title': '26 Twisted Moments from GTA 5 in Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', 'timestamp': 1386878820, 'upload_date': '20131212', 'uploader_id': 'togilvie@ign.com', }, }, ], 'params': { 'skip_download': True, }, }, { 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'md5': '618fedb9c901fd086f6f093564ef8558', 'info_dict': { 'id': '078fdd005f6d3c02f63d795faa1b984f', 'ext': 'mp4', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', 'timestamp': 1408047180, 'upload_date': '20140814', 'uploader_id': 'jamesduggan1990@gmail.com', }, }, { 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', 'only_matching': True, }, { 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'only_matching': True, }, ] def _find_video_id(self, webpage): res_id = [ r'"video_id"\s*:\s*"(.*?)"', r'class="hero-poster[^"]*?"[^>]*id="(.+?)"', r'data-video-id="(.+?)"', r']*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', webpage) if multiple_urls: entries = [self.url_result(u, ie='IGN') for u in multiple_urls] return { '_type': 'playlist', 'id': name_or_id, 'entries': entries, } video_id = self._find_video_id(webpage) if not video_id: return self.url_result(self._search_regex( self._EMBED_RE, webpage, 'embed url')) return self._get_video_info(video_id) def _get_video_info(self, video_id): api_data = self._download_json( self._API_URL_TEMPLATE % video_id, video_id) formats = [] m3u8_url = api_data['refs'].get('m3uUrl') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) f4m_url = api_data['refs'].get('f4mUrl') if f4m_url: formats.extend(self._extract_f4m_formats( f4m_url, video_id, f4m_id='hds', fatal=False)) for asset in api_data['assets']: formats.append({ 'url': asset['url'], 'tbr': asset.get('actual_bitrate_kbps'), 'fps': asset.get('frame_rate'), 'height': int_or_none(asset.get('height')), 'width': int_or_none(asset.get('width')), }) self._sort_formats(formats) thumbnails = [{ 'url': thumbnail['url'] } for 
thumbnail in api_data.get('thumbnails', [])] metadata = api_data['metadata'] return { 'id': api_data.get('videoId') or video_id, 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], 'description': metadata.get('description'), 'timestamp': parse_iso8601(metadata.get('publishDate')), 'duration': int_or_none(metadata.get('duration')), 'display_id': metadata.get('slug') or video_id, 'uploader_id': metadata.get('creator'), 'thumbnails': thumbnails, 'formats': formats, } class OneUPIE(IGNIE): _VALID_URL = r'https?://gamevideos\.1up\.com/(?Pvideo)/id/(?P.+)\.html' IE_NAME = '1up.com' _TESTS = [{ 'url': 'http://gamevideos.1up.com/video/id/34976.html', 'md5': 'c9cc69e07acb675c31a16719f909e347', 'info_dict': { 'id': '34976', 'ext': 'mp4', 'title': 'Sniper Elite V2 - Trailer', 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', 'timestamp': 1313099220, 'upload_date': '20110811', 'uploader_id': 'IGN', } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) result = super(OneUPIE, self)._real_extract(url) result['id'] = mobj.group('name_or_id') return result class PCMagIE(IGNIE): _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?Pvideos|article2)(/.+)?/(?P.+)' IE_NAME = 'pcmag' _EMBED_RE = r'iframe.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content.html?[^"]*url=([^"]+)["&]' _TESTS = [{ 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', 'md5': '212d6154fd0361a2781075f1febbe9ad', 'info_dict': { 'id': 'ee10d774b508c9b8ec07e763b9125b91', 'ext': 'mp4', 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', 'timestamp': 1420571160, 'upload_date': '20150106', 'uploader_id': 'cozzipix@gmail.com', } }, { 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', 'md5': '94130c1ca07ba0adb6088350681f16c1', 'info_dict': { 'id': '042e560ba94823d43afcb12ddf7142ca', 'ext': 'mp4', 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', 'timestamp': 1412953920, 'upload_date': '20141010', 'uploader_id': 'chris_snyder@pcmag.com', } }] youtube-dl/youtube_dl/extractor/radiojavan.py0000644000000000000000000000425212641030331020517 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import( unified_strdate, str_to_int, ) class RadioJavanIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P[^/]+)/?' 
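    # Formats are scraped from inline JavaScript assignments of the form
    # RJ.video<height>p = '<path>' (see _real_extract below); each match is
    # turned into a format entry served from media.rdjavan.com.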
_TEST = { 'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', 'md5': 'e85208ffa3ca8b83534fca9fe19af95b', 'info_dict': { 'id': 'chaartaar-ashoobam', 'ext': 'mp4', 'title': 'Chaartaar - Ashoobam', 'thumbnail': 're:^https?://.*\.jpe?g$', 'upload_date': '20150215', 'view_count': int, 'like_count': int, 'dislike_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) formats = [{ 'url': 'https://media.rdjavan.com/media/music_video/%s' % video_path, 'format_id': '%sp' % height, 'height': int(height), } for height, video_path in re.findall(r"RJ\.video(\d+)p\s*=\s*'/?([^']+)'", webpage)] self._sort_formats(formats) title = self._og_search_title(webpage) thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._search_regex( r'class="date_added">Date added: ([^<]+)<', webpage, 'upload date', fatal=False)) view_count = str_to_int(self._search_regex( r'class="views">Plays: ([\d,]+)', webpage, 'view count', fatal=False)) like_count = str_to_int(self._search_regex( r'class="rating">([\d,]+) likes', webpage, 'like count', fatal=False)) dislike_count = str_to_int(self._search_regex( r'class="rating">([\d,]+) dislikes', webpage, 'dislike count', fatal=False)) return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'upload_date': upload_date, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'formats': formats, } youtube-dl/youtube_dl/extractor/testurl.py0000644000000000000000000000416312645665720020130 0ustar rootrootfrom __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ExtractorError class TestURLIE(InfoExtractor): """ Allows addressing of the test cases as test:yout.*be_1 """ IE_DESC = False # Do not list _VALID_URL = r'test(?:url)?:(?P(?P.+?)(?:_(?P[0-9]+))?)$' def _real_extract(self, url): from ..extractor import gen_extractors mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') extractor_id = mobj.group('extractor') all_extractors = gen_extractors() rex = re.compile(extractor_id, flags=re.IGNORECASE) matching_extractors = [ e for e in all_extractors if rex.search(e.IE_NAME)] if len(matching_extractors) == 0: raise ExtractorError( 'No extractors matching %r found' % extractor_id, expected=True) elif len(matching_extractors) > 1: # Is it obvious which one to pick? 
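            # e.g. 'test:youtube' matches both 'youtube' and 'youtube:playlist';
            # prefer the extractor whose IE_NAME equals the requested id
            # (case-insensitively) before bailing out with an error.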
try: extractor = next( ie for ie in matching_extractors if ie.IE_NAME.lower() == extractor_id.lower()) except StopIteration: raise ExtractorError( ('Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors)), expected=True) else: extractor = matching_extractors[0] num_str = mobj.group('num') num = int(num_str) if num_str else 0 testcases = [] t = getattr(extractor, '_TEST', None) if t: testcases.append(t) testcases.extend(getattr(extractor, '_TESTS', [])) try: tc = testcases[num] except IndexError: raise ExtractorError( ('Test case %d not found, got only %d tests' % (num, len(testcases))), expected=True) self.to_screen('Test URL: %s' % tc['url']) return { '_type': 'url', 'url': tc['url'], 'id': video_id, } youtube-dl/youtube_dl/extractor/newstube.py0000644000000000000000000000672112641030331020240 0ustar rootroot# encoding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ExtractorError class NewstubeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P.+)' _TEST = { 'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym', 'info_dict': { 'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6', 'ext': 'flv', 'title': 'Телеканал CNN переместил город Славянск в Крым', 'description': 'md5:419a8c9f03442bc0b0a794d689360335', 'duration': 31.05, }, 'params': { # rtmp download 'skip_download': True, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') page = self._download_webpage(url, video_id, 'Downloading page') video_guid = self._html_search_regex( r'UTV - (.*?)[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' _TESTS = [{ 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', 'playlist': [ { 'md5': '6e5adc1e28253bbb1b28ab05403dd4d4', 'info_dict': { 'id': 'be6a17b0-412d-11e5-8ff7-0026b9414f30', 'ext': 'mp4', 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S1', 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', } }, { 'md5': 'd7be441fc53a1d4882fa9508a1e5b3ce', 'info_dict': { 'id': 'be6b8f96-412d-11e5-8ff7-0026b9414f30', 'ext': 'mp4', 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S2', 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', } }, { 'md5': 'efffe1728a234b2b0d2f2b343dd1946f', 'info_dict': { 'id': 'be6cf7e6-412d-11e5-8ff7-0026b9414f30', 'ext': 'mp4', 'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S3', 'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.', } }, { 'md5': '1ec6690733ab9f41709e274a1d5c7556', 'info_dict': { 'id': 'be6e3354-412d-11e5-8ff7-0026b9414f30', 'ext': 'mp4', 'title': 'ALVINNN!!! 
    _TESTS = [{
        'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
        'playlist': [
            {
                'md5': '6e5adc1e28253bbb1b28ab05403dd4d4',
                'info_dict': {
                    'id': 'be6a17b0-412d-11e5-8ff7-0026b9414f30',
                    'ext': 'mp4',
                    'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S1',
                    'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
                }
            },
            {
                'md5': 'd7be441fc53a1d4882fa9508a1e5b3ce',
                'info_dict': {
                    'id': 'be6b8f96-412d-11e5-8ff7-0026b9414f30',
                    'ext': 'mp4',
                    'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S2',
                    'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
                }
            },
            {
                'md5': 'efffe1728a234b2b0d2f2b343dd1946f',
                'info_dict': {
                    'id': 'be6cf7e6-412d-11e5-8ff7-0026b9414f30',
                    'ext': 'mp4',
                    'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S3',
                    'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
                }
            },
            {
                'md5': '1ec6690733ab9f41709e274a1d5c7556',
                'info_dict': {
                    'id': 'be6e3354-412d-11e5-8ff7-0026b9414f30',
                    'ext': 'mp4',
                    'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S4',
                    'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
                }
            },
        ],
    }]

    def _get_feed_query(self, uri):
        return compat_urllib_parse.urlencode({
            'feed': 'nick_arc_player_prime',
            'mgid': uri,
        })

    def _extract_mgid(self, webpage):
        return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid')

youtube-dl/youtube_dl/extractor/southpark.py

# encoding: utf-8
from __future__ import unicode_literals

from .mtv import MTVServicesInfoExtractor


class SouthParkIE(MTVServicesInfoExtractor):
    IE_NAME = 'southpark.cc.com'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'

    _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'

    _TESTS = [{
        'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured',
        'info_dict': {
            'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30',
            'ext': 'mp4',
            'title': 'South Park|Bat Daded',
            'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.',
        },
    }]


class SouthParkEsIE(SouthParkIE):
    IE_NAME = 'southpark.cc.com:español'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'
    _LANG = 'es'

    _TESTS = [{
        'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
        'playlist_count': 4,
    }]


class SouthParkDeIE(SouthParkIE):
    IE_NAME = 'southpark.de'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
    _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'

    _TESTS = [{
        'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
        'info_dict': {
            'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2',
            'ext': 'mp4',
            'title': 'The Government Won\'t Respect My Privacy',
            'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
        },
    }, {
        # non-ASCII characters in initial URL
        'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen',
        'playlist_count': 4,
    }, {
        # non-ASCII characters in redirect URL
        'url': 'http://www.southpark.de/alle-episoden/s18e09',
        'playlist_count': 4,
    }]


class SouthParkNlIE(SouthParkIE):
    IE_NAME = 'southpark.nl'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
    _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/'

    _TESTS = [{
        'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
        'playlist_count': 4,
    }]


class SouthParkDkIE(SouthParkIE):
    IE_NAME = 'southparkstudios.dk'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
    _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/'

    _TESTS = [{
        'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop',
        'playlist_count': 4,
    }]

youtube-dl/youtube_dl/extractor/krasview.py

# encoding: utf-8
from __future__ import unicode_literals

import json

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    js_to_json,
)


class KrasViewIE(InfoExtractor):
    IE_DESC = 'Красвью'
    _VALID_URL = r'https?://krasview\.ru/(?:video|embed)/(?P<id>\d+)'

    _TEST = {
        'url': 'http://krasview.ru/video/512228',
        'md5': '3b91003cf85fc5db277870c8ebd98eae',
        'info_dict': {
            'id': '512228',
            'ext': 'mp4',
            'title': 'Снег, лёд, заносы',
            'description': 'Снято в городе Нягань, в Ханты-Мансийском автономном округе.',
            'duration': 27,
            'thumbnail': 're:^https?://.*\.jpg',
        },
        'params': {
            'skip_download': 'Not accessible from Travis CI server',
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        flashvars = json.loads(js_to_json(self._search_regex(
            r'video_Init\(({.+?})', webpage, 'flashvars')))

        video_url = flashvars['url']
        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage, default=None)
        thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage)
        duration = int_or_none(flashvars.get('duration'))
        width = int_or_none(self._og_search_property(
            'video:width', webpage, 'video width', default=None))
        height = int_or_none(self._og_search_property(
            'video:height', webpage, 'video height', default=None))

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'width': width,
            'height': height,
        }

youtube-dl/youtube_dl/extractor/tf1.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor


class TF1IE(InfoExtractor):
    """TF1 uses the wat.tv player."""
    _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html'
    _TESTS = [{
        'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
        'info_dict': {
            'id': '10635995',
            'ext': 'mp4',
            'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle',
            'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
        },
        'params': {
            # Sometimes wat serves the whole file with the --test option
            'skip_download': True,
        },
    }, {
        'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html',
        'info_dict': {
            'id': 'le-grand-mysterioso-chuggington-7085291-739',
            'ext': 'mp4',
            'title': 'Le grand Mystérioso - Chuggington',
            'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.',
            'upload_date': '20150103',
        },
        'params': {
            # Sometimes wat serves the whole file with the --test option
            'skip_download': True,
        },
        'skip': 'HTTP Error 410: Gone',
    }, {
        'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
        'only_matching': True,
    }, {
        'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
        'only_matching': True,
    }, {
        'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        wat_id = self._html_search_regex(
            r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
            webpage, 'wat id', group='id')
        return self.url_result('wat:%s' % wat_id, 'Wat')

youtube-dl/youtube_dl/extractor/howstuffworks.py

from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    find_xpath_attr,
    int_or_none,
    js_to_json,
    unescapeHTML,
)


class HowStuffWorksIE(InfoExtractor):
    _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm'
    _TESTS = [
        {
            'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm',
            'info_dict': {
                'id': '450221',
                'ext': 'flv',
                'title': 'Cool Jobs - Iditarod Musher',
                'description': 'Cold sleds, freezing temps and warm dog breath... an Iditarod musher\'s dream. Kasey-Dee Gardner jumps on a sled to find out what the big deal is.',
                'display_id': 'cool-jobs-iditarod-musher',
                'thumbnail': 're:^https?://.*\.jpg$',
                'duration': 161,
            },
        },
        {
            'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm',
            'info_dict': {
                'id': '453464',
                'ext': 'mp4',
                'title': 'Survival Zone: Food and Water In the Savanna',
                'description': 'Learn how to find both food and water while trekking in the African savannah. In this video from the Discovery Channel.',
                'display_id': 'survival-zone-food-and-water-in-the-savanna',
                'thumbnail': 're:^https?://.*\.jpg$',
            },
        },
        {
            'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm',
            'info_dict': {
                'id': '440011',
                'ext': 'flv',
                'title': 'Sword Swallowing #1 by Dan Meyer',
                'description': 'Video footage (1 of 3) used by permission of the owner Dan Meyer through Sword Swallowers Association International ',
                'display_id': 'sword-swallowing-1-by-dan-meyer',
                'thumbnail': 're:^https?://.*\.jpg$',
            },
        },
        {
            'url': 'http://shows.howstuffworks.com/stuff-to-blow-your-mind/optical-illusions-video.htm',
            'only_matching': True,
        }
    ]

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        clip_js = self._search_regex(
            r'(?s)var clip = ({.*?});', webpage, 'clip info')
        clip_info = self._parse_json(
            clip_js, display_id, transform_source=js_to_json)

        video_id = clip_info['content_id']
        formats = []
        m3u8_url = clip_info.get('m3u8')
        if m3u8_url:
            formats += self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
        for video in clip_info.get('mp4', []):
            formats.append({
                'url': video['src'],
                'format_id': video['bitrate'],
                'vbr': int(video['bitrate'].rstrip('k')),
            })
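        # Neither HLS nor progressive MP4s in the clip JSON: fall back to the
        # SMIL manifest, whose video nodes are resolved against httpBase.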
        if not formats:
            smil = self._download_xml(
                'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % video_id,
                video_id, 'Downloading video SMIL')

            http_base = find_xpath_attr(
                smil,
                './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'),
                'name',
                'httpBase').get('content')

            URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=A&g=A'

            for video in smil.findall(
                    './{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')):
                vbr = int_or_none(video.attrib['system-bitrate'], scale=1000)
                formats.append({
                    'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX),
                    'format_id': '%dk' % vbr,
                    'vbr': vbr,
                })

        self._sort_formats(formats)

        return {
            'id': '%s' % video_id,
            'display_id': display_id,
            'title': unescapeHTML(clip_info['clip_title']),
            'description': unescapeHTML(clip_info.get('caption')),
            'thumbnail': clip_info.get('video_still_url'),
            'duration': clip_info.get('duration'),
            'formats': formats,
        }

youtube-dl/youtube_dl/extractor/anysex.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    int_or_none,
)


class AnySexIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?anysex\.com/(?P<id>\d+)'
    _TEST = {
        'url': 'http://anysex.com/156592/',
        'md5': '023e9fbb7f7987f5529a394c34ad3d3d',
        'info_dict': {
            'id': '156592',
            'ext': 'mp4',
            'title': 'Busty and sexy blondie in her bikini strips for you',
            'description': 'md5:de9e418178e2931c10b62966474e1383',
            'categories': ['Erotic'],
            'duration': 270,
            'age_limit': 18,
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')

        title = self._html_search_regex(
            r'<title>(.*?)</title>', webpage, 'title')
        description = self._html_search_regex(
            r'<div class="description"[^>]*>([^<]+)</div>',
            webpage, 'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False)

        categories = re.findall(
            r'([^<]+)', webpage)

        duration = parse_duration(self._search_regex(
            r'Duration: (?:)?(\d+:\d+)', webpage, 'duration', fatal=False))
        view_count = int_or_none(self._html_search_regex(
            r'Views: (\d+)', webpage, 'view count', fatal=False))

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'categories': categories,
            'duration': duration,
            'view_count': view_count,
            'age_limit': 18,
        }

youtube-dl/youtube_dl/extractor/tv2.py

# encoding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    int_or_none,
    float_or_none,
    parse_iso8601,
    remove_end,
)


class TV2IE(InfoExtractor):
    _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.tv2.no/v/916509/',
        'info_dict': {
            'id': '916509',
            'ext': 'mp4',
            'title': 'Se Frode Gryttens hyllest av Steven Gerrard',
            'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',
            'timestamp': 1431715610,
            'upload_date': '20150515',
            'duration': 156.967,
            'view_count': int,
            'categories': list,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)

        formats = []
        format_urls = []
        for protocol in ('HDS', 'HLS'):
            data = self._download_json(
                'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol),
                video_id, 'Downloading play JSON')['playback']
            for item in data['items']['item']:
                video_url = item.get('url')
                if not video_url or video_url in format_urls:
                    continue
                format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat'))
                if not self._is_valid_url(video_url, video_id, format_id):
                    continue
                format_urls.append(video_url)
                ext = determine_ext(video_url)
                if ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        video_url, video_id, f4m_id=format_id))
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        video_url, video_id, 'mp4', m3u8_id=format_id))
                elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
                    pass
                else:
                    formats.append({
                        'url': video_url,
                        'format_id': format_id,
                        'tbr': int_or_none(item.get('bitrate')),
                        'filesize': int_or_none(item.get('fileSize')),
                    })
        self._sort_formats(formats)
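        # play.json only describes the streams; title, description and the
        # rest of the metadata come from a separate asset JSON document.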
        asset = self._download_json(
            'http://sumo.tv2.no/api/web/asset/%s.json' % video_id,
            video_id, 'Downloading metadata JSON')['asset']

        title = asset['title']
        description = asset.get('description')
        timestamp = parse_iso8601(asset.get('createTime'))
        duration = float_or_none(asset.get('accurateDuration') or asset.get('duration'))
        view_count = int_or_none(asset.get('views'))
        categories = asset.get('keywords', '').split(',')

        thumbnails = [{
            'id': thumbnail.get('@type'),
            'url': thumbnail.get('url'),
        } for _, thumbnail in asset.get('imageVersions', {}).items()]

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'timestamp': timestamp,
            'duration': duration,
            'view_count': view_count,
            'categories': categories,
            'formats': formats,
        }


class TV2ArticleIE(InfoExtractor):
    _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542',
        'info_dict': {
            'id': '6930542',
            'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret',
            'description': 'md5:339573779d3eea3542ffe12006190954',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://www.tv2.no/a/6930542',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        entries = [
            self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2')
            for video_id in re.findall(r'data-assetid="(\d+)"', webpage)]

        title = remove_end(self._og_search_title(webpage), ' - TV2.no')
        description = remove_end(self._og_search_description(webpage), ' - TV2.no')

        return self.playlist_result(entries, playlist_id, title, description)

youtube-dl/youtube_dl/extractor/playtvak.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import (
    compat_urlparse,
    compat_urllib_parse,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    parse_iso8601,
    qualities,
)


class PlaytvakIE(InfoExtractor):
    IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz'
    _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)'
    _TESTS = [{
        'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko',
        'md5': '4525ae312c324b4be2f4603cc78ceb4a',
        'info_dict': {
            'id': 'A150730_150323_hodinovy-manzel_kuko',
            'ext': 'mp4',
            'title': 'Vyžeňte vosy a sršně ze zahrady',
            'description': 'md5:f93d398691044d303bc4a3de62f3e976',
            'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
            'duration': 279,
            'timestamp': 1438732860,
            'upload_date': '20150805',
            'is_live': False,
        }
    }, {  # live video test
        'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat',
        'info_dict': {
            'id': 'A150624_164934_planespotting_cat',
            'ext': 'flv',
            'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze',
            'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
            'is_live': True,
        },
        'params': {
            'skip_download': True,  # requires rtmpdump
        },
    }, {  # idnes.cz
        'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku',
        'md5': '819832ba33cd7016e58a6658577fe289',
        'info_dict': {
            'id': 'A150809_104116_domaci_pku',
            'ext': 'mp4',
            'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se',
            'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2',
            'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
            'duration': 39,
            'timestamp': 1438969140,
            'upload_date': '20150807',
            'is_live': False,
        }
    }, {  # lidovky.cz
        'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE',
        'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8',
        'info_dict': {
            'id': 'A150808_214044_ln-video_ELE',
            'ext': 'mp4',
            'title': 'Táhni! Demonstrace proti imigrantům budila emoce',
            'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c',
            'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
            'timestamp': 1439052180,
            'upload_date': '20150808',
            'is_live': False,
        }
    }, {  # metro.cz
        'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row',
        'md5': '84fc1deedcac37b7d4a6ccae7c716668',
        'info_dict': {
            'id': 'A141111_173251_metro-extra_row',
            'ext': 'mp4',
            'title': 'Recesisté udělali z billboardu kolotoč',
            'description': 'md5:7369926049588c3989a66c9c1a043c4c',
            'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
            'timestamp': 1415725500,
            'upload_date': '20141111',
            'is_live': False,
        }
    }, {
        'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        info_url = self._html_search_regex(
            r'Misc\.videoFLV\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url')

        parsed_url = compat_urlparse.urlparse(info_url)

        qs = compat_urlparse.parse_qs(parsed_url.query)
        qs.update({
            'reklama': ['0'],
            'type': ['js'],
        })

        info_url = compat_urlparse.urlunparse(
            parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))

        json_info = self._download_json(
            info_url, video_id,
            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])

        item = None
        for i in json_info['items']:
            if i.get('type') == 'video' or i.get('type') == 'stream':
                item = i
                break
        if not item:
            raise ExtractorError('No suitable stream found')
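        # An item carries several renditions; map the site's own
        # low/middle/high labels onto youtube-dl's quality ordering and keep
        # only the container formats we can actually download.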
        quality = qualities(('low', 'middle', 'high'))

        formats = []
        for fmt in item['video']:
            video_url = fmt.get('file')
            if not video_url:
                continue

            format_ = fmt['format']
            format_id = '%s_%s' % (format_, fmt['quality'])
            preference = None

            if format_ in ('mp4', 'webm'):
                ext = format_
            elif format_ == 'rtmp':
                ext = 'flv'
            elif format_ == 'apple':
                ext = 'mp4'
                # Some streams have mp3 audio which does not play
                # well with ffmpeg filter aac_adtstoasc
                preference = -1
            elif format_ == 'adobe':  # f4m manifest fails with 404 in 80% of requests
                continue
            else:  # Other formats not supported yet
                continue

            formats.append({
                'url': video_url,
                'ext': ext,
                'format_id': format_id,
                'quality': quality(fmt.get('quality')),
                'preference': preference,
            })
        self._sort_formats(formats)

        title = item['title']
        is_live = item['type'] == 'stream'
        if is_live:
            title = self._live_title(title)
        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
            'description', webpage, 'description')

        timestamp = None
        duration = None
        if not is_live:
            duration = int_or_none(item.get('length'))
            timestamp = item.get('published')
            if timestamp:
                timestamp = parse_iso8601(timestamp[:-5])

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': item.get('image'),
            'duration': duration,
            'timestamp': timestamp,
            'is_live': is_live,
            'formats': formats,
        }

youtube-dl/youtube_dl/extractor/ir90tv.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import remove_start


class Ir90TvIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P<id>[0-9]+)/.*'
    _TESTS = [{
        'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218',
        'md5': '411dbd94891381960cb9e13daa47a869',
        'info_dict': {
            'id': '95719',
            'ext': 'mp4',
            'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18',
            'thumbnail': 're:^https?://.*\.jpg$',
        }
    }, {
        'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = remove_start(self._html_search_regex(
            r'<title>([^<]+)</title>', webpage, 'title'), '90tv.ir :: ')

        video_url = self._search_regex(
            r'<source[^>]+src="([^"]+)"', webpage, 'video url')

        thumbnail = self._search_regex(
            r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)

        return {
            'url': video_url,
            'id': video_id,
            'title': title,
            'video_url': video_url,
            'thumbnail': thumbnail,
        }

youtube-dl/youtube_dl/extractor/urort.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
)
from ..utils import (
    unified_strdate,
)


class UrortIE(InfoExtractor):
    IE_DESC = 'NRK P3 Urørt'
    _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$'

    _TEST = {
        'url': 'https://urort.p3.no/#!/Band/Gerilja',
        'md5': '5ed31a924be8a05e47812678a86e127b',
        'info_dict': {
            'id': '33124-24',
            'ext': 'mp3',
            'title': 'The Bomb',
            'thumbnail': 're:^https?://.+\.jpg',
            'uploader': 'Gerilja',
            'uploader_id': 'Gerilja',
            'upload_date': '20100323',
        },
        'params': {
            'matchtitle': '^The Bomb$',  # To test, we want just one video
        }
    }

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
        json_url = 'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter=%s&$orderby=Released%%20desc&$expand=Tags%%2CFiles' % fstr
        songs = self._download_json(json_url, playlist_id)
        entries = []
        for s in songs:
            formats = [{
                'tbr': f.get('Quality'),
                'ext': f['FileType'],
                'format_id': '%s-%s' % (f['FileType'], f.get('Quality', '')),
                'url': 'http://p3urort.blob.core.windows.net/tracks/%s' % f['FileRef'],
                'preference': 3 if f['FileType'] == 'mp3' else 2,
            } for f in s['Files']]
            self._sort_formats(formats)
            e = {
                'id': '%d-%s' % (s['BandId'], s['$id']),
                'title': s['Title'],
                'uploader_id': playlist_id,
                'uploader': s.get('BandName', playlist_id),
                'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
                'upload_date': unified_strdate(s.get('Released')),
                'formats': formats,
            }
            entries.append(e)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': playlist_id,
            'entries': entries,
        }

youtube-dl/youtube_dl/extractor/sunporno.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    int_or_none,
    qualities,
    determine_ext,
)


class SunPornoIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?sunporno\.com/videos/(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.sunporno.com/videos/807778/',
        'md5': '6457d3c165fd6de062b99ef6c2ff4c86',
        'info_dict': {
            'id': '807778',
            'ext': 'flv',
            'title': 'md5:0a400058e8105d39e35c35e7c5184164',
            'description': 'md5:a31241990e1bd3a64e72ae99afb325fb',
            'thumbnail': 're:^https?://.*\.jpg$',
            'duration': 302,
            'age_limit': 18,
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<title>([^<]+)</title>', webpage, 'title')
        description = self._html_search_meta(
            'description', webpage, 'description')
        thumbnail = self._html_search_regex(
            r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)

        duration = parse_duration(self._search_regex(
            r'itemprop="duration">\s*(\d+:\d+)\s*<',
            webpage, 'duration', fatal=False))

        view_count = int_or_none(self._html_search_regex(
            r'class="views">(?: