beautifulsoup4-4.6.0/0000775000175000017500000000000013103623241016127 5ustar leonardrleonardr00000000000000beautifulsoup4-4.6.0/scripts/0000775000175000017500000000000013103623241017616 5ustar leonardrleonardr00000000000000beautifulsoup4-4.6.0/scripts/demonstrate_parser_differences.py0000644000175000017500000000564011721435166026444 0ustar leonardrleonardr00000000000000"""Demonstrate how different parsers parse the same markup. Beautiful Soup can use any of a number of different parsers. Every parser should behave more or less the same on valid markup, and Beautiful Soup's unit tests make sure this is the case. But every parser handles invalid markup differently. Even different versions of the same parser handle invalid markup differently. So instead of unit tests I've created this educational demonstration script. The file demonstration_markup.txt contains many lines of HTML. This script tests each line of markup against every parser you have installed, and prints out how each parser sees that markup. This may help you choose a parser, or understand why Beautiful Soup presents your document the way it does. """ import os import sys from bs4 import BeautifulSoup parsers = ['html.parser'] try: from bs4.builder import _lxml parsers.append('lxml') except ImportError, e: pass try: from bs4.builder import _html5lib parsers.append('html5lib') except ImportError, e: pass class Demonstration(object): def __init__(self, markup): self.results = {} self.markup = markup def run_against(self, *parser_names): uniform_results = True previous_output = None for parser in parser_names: try: soup = BeautifulSoup(self.markup, parser) if markup.startswith("
"): # Extract the interesting part output = soup.div else: output = soup except Exception, e: output = "[EXCEPTION] %s" % str(e) self.results[parser] = output if previous_output is None: previous_output = output elif previous_output != output: uniform_results = False return uniform_results def dump(self): print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")) for parser, output in self.results.items(): print "%s: %s" % (parser.rjust(13), output.encode("utf8")) different_results = [] uniform_results = [] print "= Testing the following parsers: %s =" % ", ".join(parsers) print input_file = sys.stdin if sys.stdin.isatty(): for filename in [ "demonstration_markup.txt", os.path.join("scripts", "demonstration_markup.txt")]: if os.path.exists(filename): input_file = open(filename) for markup in input_file: demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n")) is_uniform = demo.run_against(*parsers) if is_uniform: uniform_results.append(demo) else: different_results.append(demo) print "== Markup that's handled the same in every parser ==" print for demo in uniform_results: demo.dump() print print "== Markup that's not handled the same in every parser ==" print for demo in different_results: demo.dump() print beautifulsoup4-4.6.0/scripts/demonstration_markup.txt0000644000175000017500000000506411721435166024642 0ustar leonardrleonardr00000000000000A bare string
HTML5 does allow CDATA sections in SVG
A tag
A
tag that supposedly has contents.
AT&T
This numeric entity is missing the final semicolon:
, that attribute value was closed by the subsequent tag
a
This document contains (do you see it?)
This document ends with That attribute value was bogus
The doctype is invalid because it contains extra whitespace
That boolean attribute had no value
Here's a nonexistent entity: &#foo; (do you see it?)
This document ends before the entity finishes: >

Paragraphs shouldn't contain block display elements, but this one does:

you see?

Multiple values for the same attribute.
Here's a table
This tag contains nothing but whitespace:

This p tag is cut off by

the end of the blockquote tag
Here's a nested table:
foo
This table contains bare markup
This document contains a surprise doctype
Tag name contains Unicode characters
beautifulsoup4-4.6.0/PKG-INFO0000664000175000017500000000173713103623241017234 0ustar leonardrleonardr00000000000000Metadata-Version: 1.1 Name: beautifulsoup4 Version: 4.6.0 Summary: Screen-scraping library Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/ Author: Leonard Richardson Author-email: leonardr@segfault.org License: MIT Download-URL: http://www.crummy.com/software/BeautifulSoup/bs4/download/ Description: Beautiful Soup sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree. Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Text Processing :: Markup :: HTML Classifier: Topic :: Text Processing :: Markup :: XML Classifier: Topic :: Text Processing :: Markup :: SGML Classifier: Topic :: Software Development :: Libraries :: Python Modules beautifulsoup4-4.6.0/doc.zh/0000775000175000017500000000000013103623241017314 5ustar leonardrleonardr00000000000000beautifulsoup4-4.6.0/doc.zh/Makefile0000644000175000017500000001101612542753330020762 0ustar leonardrleonardr00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " text to make text files" @echo " man to make manual pages" @echo " changes to make an overview of all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: -rm -rf $(BUILDDIR)/* html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." make -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." beautifulsoup4-4.6.0/doc.zh/source/0000775000175000017500000000000013103623241020614 5ustar leonardrleonardr00000000000000beautifulsoup4-4.6.0/doc.zh/source/conf.py0000644000175000017500000002001012542753424022117 0ustar leonardrleonardr00000000000000# -*- coding: utf-8 -*- # # Beautiful Soup documentation build configuration file, created by # sphinx-quickstart on Thu Jan 26 11:22:55 2012. # # This file is execfile()d with the current directory set to its containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import sys, os # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.insert(0, os.path.abspath('.')) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. #source_encoding = 'utf-8-sig' # The master toctree document. master_doc = 'index' # General information about the project. project = u'Beautiful Soup' copyright = u'2012, Leonard Richardson' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = '4' # The full version, including alpha/beta/rc tags. release = '4.2.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. #language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. #today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = [] # The reST default role (used for this markup: `text`) to use for all documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. #add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). #add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = 'default' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. #html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. #html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # A shorter title for the navigation bar. Default is the same as html_title. #html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. #html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. #html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. #html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} # If false, no module index is generated. #html_domain_indices = True # If false, no index is generated. #html_use_index = True # If true, the index is split into individual pages for each letter. #html_split_index = False # If true, links to the reST sources are added to the pages. #html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. #html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. #html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = 'BeautifulSoupdoc' # -- Options for LaTeX output -------------------------------------------------- # The paper size ('letter' or 'a4'). #latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). #latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation', u'Leonard Richardson', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # If true, show page references after internal links. #latex_show_pagerefs = False # If true, show URL addresses after external links. #latex_show_urls = False # Additional stuff for the LaTeX preamble. #latex_preamble = '' # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ ('index', 'beautifulsoup', u'Beautiful Soup Documentation', [u'Leonard Richardson'], 1) ] # -- Options for Epub output --------------------------------------------------- # Bibliographic Dublin Core info. epub_title = u'Beautiful Soup' epub_author = u'Leonard Richardson' epub_publisher = u'Leonard Richardson' epub_copyright = u'2012, Leonard Richardson' # The language of the text. It defaults to the language option # or en if the language is not set. #epub_language = '' # The scheme of the identifier. Typical schemes are ISBN or URL. #epub_scheme = '' # The unique identifier of the text. This can be a ISBN number # or the project homepage. #epub_identifier = '' # A unique identification for the text. #epub_uid = '' # HTML files that should be inserted before the pages created by sphinx. # The format is a list of tuples containing the path and title. #epub_pre_files = [] # HTML files shat should be inserted after the pages created by sphinx. # The format is a list of tuples containing the path and title. #epub_post_files = [] # A list of files that should not be packed into the epub file. #epub_exclude_files = [] # The depth of the table of contents in toc.ncx. #epub_tocdepth = 3 # Allow duplicate toc entries. #epub_tocdup = True beautifulsoup4-4.6.0/setup.py0000664000175000017500000000247513103622717017660 0ustar leonardrleonardr00000000000000from setuptools import ( setup, find_packages, ) setup( name="beautifulsoup4", version = "4.6.0", author="Leonard Richardson", author_email='leonardr@segfault.org', url="http://www.crummy.com/software/BeautifulSoup/bs4/", download_url = "http://www.crummy.com/software/BeautifulSoup/bs4/download/", description="Screen-scraping library", long_description="""Beautiful Soup sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.""", license="MIT", packages=find_packages(exclude=['tests*']), extras_require = { 'lxml' : [ 'lxml'], 'html5lib' : ['html5lib'], }, use_2to3 = True, classifiers=["Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python", "Programming Language :: Python :: 2.7", 'Programming Language :: Python :: 3', "Topic :: Text Processing :: Markup :: HTML", "Topic :: Text Processing :: Markup :: XML", "Topic :: Text Processing :: Markup :: SGML", "Topic :: Software Development :: Libraries :: Python Modules", ], ) beautifulsoup4-4.6.0/bs4/0000775000175000017500000000000013103623241016617 5ustar leonardrleonardr00000000000000beautifulsoup4-4.6.0/bs4/__init__.py0000644000175000017500000004770413103622642020746 0ustar leonardrleonardr00000000000000"""Beautiful Soup Elixir and Tonic "The Screen-Scraper's Friend" http://www.crummy.com/software/BeautifulSoup/ Beautiful Soup uses a pluggable XML or HTML parser to parse a (possibly invalid) document into a tree representation. Beautiful Soup provides methods and Pythonic idioms that make it easy to navigate, search, and modify the parse tree. Beautiful Soup works with Python 2.7 and up. It works better if lxml and/or html5lib is installed. For more than you ever wanted to know about Beautiful Soup, see the documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "4.6.0" __copyright__ = "Copyright (c) 2004-2017 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] import os import re import traceback import warnings from .builder import builder_registry, ParserRejectedMarkup from .dammit import UnicodeDammit from .element import ( CData, Comment, DEFAULT_OUTPUT_ENCODING, Declaration, Doctype, NavigableString, PageElement, ProcessingInstruction, ResultSet, SoupStrainer, Tag, ) # The very first thing we do is give a useful error if someone is # running this code under Python 3 without converting it. 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' class BeautifulSoup(Tag): """ This class defines the basic interface called by the tree builders. These methods will be called by the parser: reset() feed(markup) The tree builder may call these methods from its feed() implementation: handle_starttag(name, attrs) # See note about return value handle_endtag(name) handle_data(data) # Appends to the current data node endData(containerClass=NavigableString) # Ends the current data node No matter how complicated the underlying parser is, you should be able to build a tree using 'start tag' events, 'end tag' events, 'data' events, and "done with data" events. If you encounter an empty-element tag (aka a self-closing tag, like HTML's
tag), call handle_starttag and then handle_endtag. """ ROOT_TAG_NAME = u'[document]' # If the end-user gives no indication which tree builder they # want, look for one with these features. DEFAULT_BUILDER_FEATURES = ['html', 'fast'] ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, **kwargs): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" if 'convertEntities' in kwargs: warnings.warn( "BS4 does not respect the convertEntities argument to the " "BeautifulSoup constructor. Entities are always converted " "to Unicode characters.") if 'markupMassage' in kwargs: del kwargs['markupMassage'] warnings.warn( "BS4 does not respect the markupMassage argument to the " "BeautifulSoup constructor. The tree builder is responsible " "for any necessary markup massage.") if 'smartQuotesTo' in kwargs: del kwargs['smartQuotesTo'] warnings.warn( "BS4 does not respect the smartQuotesTo argument to the " "BeautifulSoup constructor. Smart quotes are always converted " "to Unicode characters.") if 'selfClosingTags' in kwargs: del kwargs['selfClosingTags'] warnings.warn( "BS4 does not respect the selfClosingTags argument to the " "BeautifulSoup constructor. The tree builder is responsible " "for understanding self-closing tags.") if 'isHTML' in kwargs: del kwargs['isHTML'] warnings.warn( "BS4 does not respect the isHTML argument to the " "BeautifulSoup constructor. Suggest you use " "features='lxml' for HTML and features='lxml-xml' for " "XML.") def deprecated_argument(old_name, new_name): if old_name in kwargs: warnings.warn( 'The "%s" argument to the BeautifulSoup constructor ' 'has been renamed to "%s."' % (old_name, new_name)) value = kwargs[old_name] del kwargs[old_name] return value return None parse_only = parse_only or deprecated_argument( "parseOnlyThese", "parse_only") from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") if from_encoding and isinstance(markup, unicode): warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") from_encoding = None if len(kwargs) > 0: arg = kwargs.keys().pop() raise TypeError( "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: original_features = features if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES builder_class = builder_registry.lookup(*features) if builder_class is None: raise FeatureNotFound( "Couldn't find a tree builder with the features you " "requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() if not (original_features == builder.NAME or original_features in builder.ALTERNATE_NAMES): if builder.is_xml: markup_type = "XML" else: markup_type = "HTML" caller = traceback.extract_stack()[0] filename = caller[0] line_number = caller[1] warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( filename=filename, line_number=line_number, parser=builder.NAME, markup_type=markup_type)) self.builder = builder self.is_xml = builder.is_xml self.known_xml = self.is_xml self.builder.soup = self self.parse_only = parse_only if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() elif len(markup) <= 256 and ( (isinstance(markup, bytes) and not b'<' in markup) or (isinstance(markup, unicode) and not u'<' in markup) ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, # just in case that's what the user really wants. if (isinstance(markup, unicode) and not os.path.supports_unicode_filenames): possible_filename = markup.encode("utf8") else: possible_filename = markup is_file = False try: is_file = os.path.exists(possible_filename) except Exception, e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. pass if is_file: if isinstance(markup, unicode): markup = markup.encode("utf8") warnings.warn( '"%s" looks like a filename, not markup. You should' ' probably open this file and pass the filehandle into' ' Beautiful Soup.' % markup) self._check_markup_is_url(markup) for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( self.builder.prepare_markup( markup, from_encoding, exclude_encodings=exclude_encodings)): self.reset() try: self._feed() break except ParserRejectedMarkup: pass # Clear out the markup and remove the builder's circular # reference to this object. self.markup = None self.builder.soup = None def __copy__(self): copy = type(self)( self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' ) # Although we encoded the tree to UTF-8, that may not have # been the encoding of the original markup. Set the copy's # .original_encoding to reflect the original object's # .original_encoding. copy.original_encoding = self.original_encoding return copy def __getstate__(self): # Frequently a tree builder can't be pickled. d = dict(self.__dict__) if 'builder' in d and not self.builder.picklable: d['builder'] = None return d @staticmethod def _check_markup_is_url(markup): """ Check if markup looks like it's actually a url and raise a warning if so. Markup can be unicode or str (py2) / bytes (py3). """ if isinstance(markup, bytes): space = b' ' cant_start_with = (b"http:", b"https:") elif isinstance(markup, unicode): space = u' ' cant_start_with = (u"http:", u"https:") else: return if any(markup.startswith(prefix) for prefix in cant_start_with): if not space in markup: if isinstance(markup, bytes): decoded_markup = markup.decode('utf-8', 'replace') else: decoded_markup = markup warnings.warn( '"%s" looks like a URL. Beautiful Soup is not an' ' HTTP client. You should probably use an HTTP client like' ' requests to get the document behind the URL, and feed' ' that document to Beautiful Soup.' % decoded_markup ) def _feed(self): # Convert the document to Unicode. self.builder.reset() self.builder.feed(self.markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag() def reset(self): Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) self.hidden = 1 self.builder.reset() self.current_data = [] self.currentTag = None self.tagStack = [] self.preserve_whitespace_tag_stack = [] self.pushTag(self) def new_tag(self, name, namespace=None, nsprefix=None, **attrs): """Create a new tag associated with this soup.""" return Tag(None, self.builder, name, namespace, nsprefix, attrs) def new_string(self, s, subclass=NavigableString): """Create a new NavigableString associated with this soup.""" return subclass(s) def insert_before(self, successor): raise NotImplementedError("BeautifulSoup objects don't support insert_before().") def insert_after(self, successor): raise NotImplementedError("BeautifulSoup objects don't support insert_after().") def popTag(self): tag = self.tagStack.pop() if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: self.preserve_whitespace_tag_stack.pop() #print "Pop", tag.name if self.tagStack: self.currentTag = self.tagStack[-1] return self.currentTag def pushTag(self, tag): #print "Push", tag.name if self.currentTag: self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] if tag.name in self.builder.preserve_whitespace_tags: self.preserve_whitespace_tag_stack.append(tag) def endData(self, containerClass=NavigableString): if self.current_data: current_data = u''.join(self.current_data) # If whitespace is not preserved, and this string contains # nothing but ASCII spaces, replace it with a single space # or newline. if not self.preserve_whitespace_tag_stack: strippable = True for i in current_data: if i not in self.ASCII_SPACES: strippable = False break if strippable: if '\n' in current_data: current_data = '\n' else: current_data = ' ' # Reset the data collector. self.current_data = [] # Should we add this string to the tree at all? if self.parse_only and len(self.tagStack) <= 1 and \ (not self.parse_only.text or \ not self.parse_only.search(current_data)): return o = containerClass(current_data) self.object_was_parsed(o) def object_was_parsed(self, o, parent=None, most_recent_element=None): """Add an object to the parse tree.""" parent = parent or self.currentTag previous_element = most_recent_element or self._most_recent_element next_element = previous_sibling = next_sibling = None if isinstance(o, Tag): next_element = o.next_element next_sibling = o.next_sibling previous_sibling = o.previous_sibling if not previous_element: previous_element = o.previous_element o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) self._most_recent_element = o parent.contents.append(o) if parent.next_sibling: # This node is being inserted into an element that has # already been parsed. Deal with any dangling references. index = len(parent.contents)-1 while index >= 0: if parent.contents[index] is o: break index -= 1 else: raise ValueError( "Error building tree: supposedly %r was inserted " "into %r after the fact, but I don't see it!" % ( o, parent ) ) if index == 0: previous_element = parent previous_sibling = None else: previous_element = previous_sibling = parent.contents[index-1] if index == len(parent.contents)-1: next_element = parent.next_sibling next_sibling = None else: next_element = next_sibling = parent.contents[index+1] o.previous_element = previous_element if previous_element: previous_element.next_element = o o.next_element = next_element if next_element: next_element.previous_element = o o.next_sibling = next_sibling if next_sibling: next_sibling.previous_sibling = o o.previous_sibling = previous_sibling if previous_sibling: previous_sibling.next_sibling = o def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag stack up to but *not* including the most recent instqance of the given tag.""" #print "Popping to %s" % name if name == self.ROOT_TAG_NAME: # The BeautifulSoup object itself can never be popped. return most_recently_popped = None stack_size = len(self.tagStack) for i in range(stack_size - 1, 0, -1): t = self.tagStack[i] if (name == t.name and nsprefix == t.prefix): if inclusivePop: most_recently_popped = self.popTag() break most_recently_popped = self.popTag() return most_recently_popped def handle_starttag(self, name, namespace, nsprefix, attrs): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the SoupStrainer. You should proceed as if the tag had not occurred in the document. For instance, if this was a self-closing tag, don't call handle_endtag. """ # print "Start tag %s: %s" % (name, attrs) self.endData() if (self.parse_only and len(self.tagStack) <= 1 and (self.parse_only.text or not self.parse_only.search_tag(name, attrs))): return None tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, self.currentTag, self._most_recent_element) if tag is None: return tag if self._most_recent_element: self._most_recent_element.next_element = tag self._most_recent_element = tag self.pushTag(tag) return tag def handle_endtag(self, name, nsprefix=None): #print "End tag: " + name self.endData() self._popToTag(name, nsprefix) def handle_data(self, data): self.current_data.append(data) def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): """Returns a string or Unicode representation of this document. To get Unicode, pass None for encoding.""" if self.is_xml: # Print the XML declaration encoding_part = '' if eventual_encoding != None: encoding_part = ' encoding="%s"' % eventual_encoding prefix = u'\n' % encoding_part else: prefix = u'' if not pretty_print: indent_level = None else: indent_level = 0 return prefix + super(BeautifulSoup, self).decode( indent_level, eventual_encoding, formatter) # Alias to make it easier to type import: 'from bs4 import _soup' _s = BeautifulSoup _soup = BeautifulSoup class BeautifulStoneSoup(BeautifulSoup): """Deprecated interface to an XML parser.""" def __init__(self, *args, **kwargs): kwargs['features'] = 'xml' warnings.warn( 'The BeautifulStoneSoup class is deprecated. Instead of using ' 'it, pass features="xml" into the BeautifulSoup constructor.') super(BeautifulStoneSoup, self).__init__(*args, **kwargs) class StopParsing(Exception): pass class FeatureNotFound(ValueError): pass #By default, act as an HTML pretty-printer. if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) print soup.prettify() beautifulsoup4-4.6.0/bs4/diagnose.py0000664000175000017500000001513312746006631020777 0ustar leonardrleonardr00000000000000"""Diagnostic functions, mainly for use when doing tech support.""" # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. __license__ = "MIT" import cProfile from StringIO import StringIO from HTMLParser import HTMLParser import bs4 from bs4 import BeautifulSoup, __version__ from bs4.builder import builder_registry import os import pstats import random import tempfile import time import traceback import sys import cProfile def diagnose(data): """Diagnostic suite for isolating common problems.""" print "Diagnostic running on Beautiful Soup %s" % __version__ print "Python version %s" % sys.version basic_parsers = ["html.parser", "html5lib", "lxml"] for name in basic_parsers: for builder in builder_registry.builders: if name in builder.features: break else: basic_parsers.remove(name) print ( "I noticed that %s is not installed. Installing it may help." % name) if 'lxml' in basic_parsers: basic_parsers.append(["lxml", "xml"]) try: from lxml import etree print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) except ImportError, e: print ( "lxml is not installed or couldn't be imported.") if 'html5lib' in basic_parsers: try: import html5lib print "Found html5lib version %s" % html5lib.__version__ except ImportError, e: print ( "html5lib is not installed or couldn't be imported.") if hasattr(data, 'read'): data = data.read() elif os.path.exists(data): print '"%s" looks like a filename. Reading data from the file.' % data with open(data) as fp: data = fp.read() elif data.startswith("http:") or data.startswith("https:"): print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." return print for parser in basic_parsers: print "Trying to parse your markup with %s" % parser success = False try: soup = BeautifulSoup(data, parser) success = True except Exception, e: print "%s could not parse the markup." % parser traceback.print_exc() if success: print "Here's what %s did with the markup:" % parser print soup.prettify() print "-" * 80 def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. This lets you see how lxml parses a document when no Beautiful Soup code is running. """ from lxml import etree for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): print("%s, %4s, %s" % (event, element.tag, element.text)) class AnnouncingParser(HTMLParser): """Announces HTMLParser parse events, without doing anything else.""" def _p(self, s): print(s) def handle_starttag(self, name, attrs): self._p("%s START" % name) def handle_endtag(self, name): self._p("%s END" % name) def handle_data(self, data): self._p("%s DATA" % data) def handle_charref(self, name): self._p("%s CHARREF" % name) def handle_entityref(self, name): self._p("%s ENTITYREF" % name) def handle_comment(self, data): self._p("%s COMMENT" % data) def handle_decl(self, data): self._p("%s DECL" % data) def unknown_decl(self, data): self._p("%s UNKNOWN-DECL" % data) def handle_pi(self, data): self._p("%s PI" % data) def htmlparser_trace(data): """Print out the HTMLParser events that occur during parsing. This lets you see how HTMLParser parses a document when no Beautiful Soup code is running. """ parser = AnnouncingParser() parser.feed(data) _vowels = "aeiou" _consonants = "bcdfghjklmnpqrstvwxyz" def rword(length=5): "Generate a random word-like string." s = '' for i in range(length): if i % 2 == 0: t = _consonants else: t = _vowels s += random.choice(t) return s def rsentence(length=4): "Generate a random sentence-like string." return " ".join(rword(random.randint(4,9)) for i in range(length)) def rdoc(num_elements=1000): """Randomly generate an invalid HTML document.""" tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] elements = [] for i in range(num_elements): choice = random.randint(0,3) if choice == 0: # New tag. tag_name = random.choice(tag_names) elements.append("<%s>" % tag_name) elif choice == 1: elements.append(rsentence(random.randint(1,4))) elif choice == 2: # Close a tag. tag_name = random.choice(tag_names) elements.append("" % tag_name) return "" + "\n".join(elements) + "" def benchmark_parsers(num_elements=100000): """Very basic head-to-head performance benchmark.""" print "Comparative parser benchmark on Beautiful Soup %s" % __version__ data = rdoc(num_elements) print "Generated a large invalid HTML document (%d bytes)." % len(data) for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: success = False try: a = time.time() soup = BeautifulSoup(data, parser) b = time.time() success = True except Exception, e: print "%s could not parse the markup." % parser traceback.print_exc() if success: print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) from lxml import etree a = time.time() etree.HTML(data) b = time.time() print "Raw lxml parsed the markup in %.2fs." % (b-a) import html5lib parser = html5lib.HTMLParser() a = time.time() parser.parse(data) b = time.time() print "Raw html5lib parsed the markup in %.2fs." % (b-a) def profile(num_elements=100000, parser="lxml"): filehandle = tempfile.NamedTemporaryFile() filename = filehandle.name data = rdoc(num_elements) vars = dict(bs4=bs4, data=data, parser=parser) cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) stats = pstats.Stats(filename) # stats.strip_dirs() stats.sort_stats("cumulative") stats.print_stats('_html5lib|bs4', 50) if __name__ == '__main__': diagnose(sys.stdin.read()) beautifulsoup4-4.6.0/bs4/element.py0000644000175000017500000020632513103607753020642 0ustar leonardrleonardr00000000000000# Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. __license__ = "MIT" import collections import re import shlex import sys import warnings from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) whitespace_re = re.compile("\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @property def alias(self): return getattr(self, attr) @alias.setter def alias(self): return setattr(self, attr) return alias class NamespacedAttribute(unicode): def __new__(cls, prefix, name, namespace=None): if name is None: obj = unicode.__new__(cls, prefix) elif prefix is None: # Not really namespaced. obj = unicode.__new__(cls, name) else: obj = unicode.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace return obj class AttributeValueWithCharsetSubstitution(unicode): """A stand-in object for a character encoding specified in HTML.""" class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): """A generic stand-in for the value of a meta tag's 'charset' attribute. When Beautiful Soup parses the markup '', the value of the 'charset' attribute will be one of these objects. """ def __new__(cls, original_value): obj = unicode.__new__(cls, original_value) obj.original_value = original_value return obj def encode(self, encoding): return encoding class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): """A generic stand-in for the value of a meta tag's 'content' attribute. When Beautiful Soup parses the markup: The value of the 'content' attribute will be one of these objects. """ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) if match is None: # No substitution necessary. return unicode.__new__(unicode, original_value) obj = unicode.__new__(cls, original_value) obj.original_value = original_value return obj def encode(self, encoding): def rewrite(match): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) class HTMLAwareEntitySubstitution(EntitySubstitution): """Entity substitution rules that are aware of some HTML quirks. Specifically, the contents of """) [soup.script.extract() for i in soup.find_all("script")] self.assertEqual("\n\n\n", unicode(soup.body)) def test_extract_works_when_element_is_surrounded_by_identical_strings(self): soup = self.soup( '\n' 'hi\n' '') soup.find('body').extract() self.assertEqual(None, soup.find('body')) def test_clear(self): """Tag.clear()""" soup = self.soup("

String Italicized and another

") # clear using extract() a = soup.a soup.p.clear() self.assertEqual(len(soup.p.contents), 0) self.assertTrue(hasattr(a, "contents")) # clear using decompose() em = a.em a.clear(decompose=True) self.assertEqual(0, len(em.contents)) def test_string_set(self): """Tag.string = 'string'""" soup = self.soup(" ") soup.a.string = "foo" self.assertEqual(soup.a.contents, ["foo"]) soup.b.string = "bar" self.assertEqual(soup.b.contents, ["bar"]) def test_string_set_does_not_affect_original_string(self): soup = self.soup("foobar") soup.b.string = soup.c.string self.assertEqual(soup.a.encode(), b"barbar") def test_set_string_preserves_class_of_string(self): soup = self.soup("") cdata = CData("foo") soup.a.string = cdata self.assertTrue(isinstance(soup.a.string, CData)) class TestElementObjects(SoupTest): """Test various features of element objects.""" def test_len(self): """The length of an element is its number of children.""" soup = self.soup("123") # The BeautifulSoup object itself contains one element: the # tag. self.assertEqual(len(soup.contents), 1) self.assertEqual(len(soup), 1) # The tag contains three elements: the text node "1", the # tag, and the text node "3". self.assertEqual(len(soup.top), 3) self.assertEqual(len(soup.top.contents), 3) def test_member_access_invokes_find(self): """Accessing a Python member .foo invokes find('foo')""" soup = self.soup('') self.assertEqual(soup.b, soup.find('b')) self.assertEqual(soup.b.i, soup.find('b').find('i')) self.assertEqual(soup.a, None) def test_deprecated_member_access(self): soup = self.soup('') with warnings.catch_warnings(record=True) as w: tag = soup.bTag self.assertEqual(soup.b, tag) self.assertEqual( '.bTag is deprecated, use .find("b") instead.', str(w[0].message)) def test_has_attr(self): """has_attr() checks for the presence of an attribute. Please note note: has_attr() is different from __in__. has_attr() checks the tag's attributes and __in__ checks the tag's chidlren. """ soup = self.soup("") self.assertTrue(soup.foo.has_attr('attr')) self.assertFalse(soup.foo.has_attr('attr2')) def test_attributes_come_out_in_alphabetical_order(self): markup = '' self.assertSoupEquals(markup, '') def test_string(self): # A tag that contains only a text node makes that node # available as .string. soup = self.soup("foo") self.assertEqual(soup.b.string, 'foo') def test_empty_tag_has_no_string(self): # A tag with no children has no .stirng. soup = self.soup("") self.assertEqual(soup.b.string, None) def test_tag_with_multiple_children_has_no_string(self): # A tag with no children has no .string. soup = self.soup("foo") self.assertEqual(soup.b.string, None) soup = self.soup("foobar
") self.assertEqual(soup.b.string, None) # Even if all the children are strings, due to trickery, # it won't work--but this would be a good optimization. soup = self.soup("foo") soup.a.insert(1, "bar") self.assertEqual(soup.a.string, None) def test_tag_with_recursive_string_has_string(self): # A tag with a single child which has a .string inherits that # .string. soup = self.soup("foo") self.assertEqual(soup.a.string, "foo") self.assertEqual(soup.string, "foo") def test_lack_of_string(self): """Only a tag containing a single text node has a .string.""" soup = self.soup("feo") self.assertFalse(soup.b.string) soup = self.soup("") self.assertFalse(soup.b.string) def test_all_text(self): """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" soup = self.soup("ar t ") self.assertEqual(soup.a.text, "ar t ") self.assertEqual(soup.a.get_text(strip=True), "art") self.assertEqual(soup.a.get_text(","), "a,r, , t ") self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") def test_get_text_ignores_comments(self): soup = self.soup("foobar") self.assertEqual(soup.get_text(), "foobar") self.assertEqual( soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") self.assertEqual( soup.get_text(types=None), "fooIGNOREbar") def test_all_strings_ignores_comments(self): soup = self.soup("foobar") self.assertEqual(['foo', 'bar'], list(soup.strings)) class TestCDAtaListAttributes(SoupTest): """Testing cdata-list attributes like 'class'. """ def test_single_value_becomes_list(self): soup = self.soup("") self.assertEqual(["foo"],soup.a['class']) def test_multiple_values_becomes_list(self): soup = self.soup("") self.assertEqual(["foo", "bar"], soup.a['class']) def test_multiple_values_separated_by_weird_whitespace(self): soup = self.soup("") self.assertEqual(["foo", "bar", "baz"],soup.a['class']) def test_attributes_joined_into_string_on_output(self): soup = self.soup("") self.assertEqual(b'', soup.a.encode()) def test_get_attribute_list(self): soup = self.soup("") self.assertEqual(['abc def'], soup.a.get_attribute_list('id')) def test_accept_charset(self): soup = self.soup('
') self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) def test_cdata_attribute_applying_only_to_one_tag(self): data = '' soup = self.soup(data) # We saw in another test that accept-charset is a cdata-list # attribute for the tag. But it's not a cdata-list # attribute for any other tag. self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) def test_string_has_immutable_name_property(self): string = self.soup("s").string self.assertEqual(None, string.name) def t(): string.name = 'foo' self.assertRaises(AttributeError, t) class TestPersistence(SoupTest): "Testing features like pickle and deepcopy." def setUp(self): super(TestPersistence, self).setUp() self.page = """ Beautiful Soup: We called him Tortoise because he taught us. foo bar """ self.tree = self.soup(self.page) def test_pickle_and_unpickle_identity(self): # Pickling a tree, then unpickling it, yields a tree identical # to the original. dumped = pickle.dumps(self.tree, 2) loaded = pickle.loads(dumped) self.assertEqual(loaded.__class__, BeautifulSoup) self.assertEqual(loaded.decode(), self.tree.decode()) def test_deepcopy_identity(self): # Making a deepcopy of a tree yields an identical tree. copied = copy.deepcopy(self.tree) self.assertEqual(copied.decode(), self.tree.decode()) def test_copy_preserves_encoding(self): soup = BeautifulSoup(b'

 

', 'html.parser') encoding = soup.original_encoding copy = soup.__copy__() self.assertEqual(u"

 

", unicode(copy)) self.assertEqual(encoding, copy.original_encoding) def test_unicode_pickle(self): # A tree containing Unicode characters can be pickled. html = u"\N{SNOWMAN}" soup = self.soup(html) dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) loaded = pickle.loads(dumped) self.assertEqual(loaded.decode(), soup.decode()) def test_copy_navigablestring_is_not_attached_to_tree(self): html = u"FooBar" soup = self.soup(html) s1 = soup.find(string="Foo") s2 = copy.copy(s1) self.assertEqual(s1, s2) self.assertEqual(None, s2.parent) self.assertEqual(None, s2.next_element) self.assertNotEqual(None, s1.next_sibling) self.assertEqual(None, s2.next_sibling) self.assertEqual(None, s2.previous_element) def test_copy_navigablestring_subclass_has_same_type(self): html = u"" soup = self.soup(html) s1 = soup.string s2 = copy.copy(s1) self.assertEqual(s1, s2) self.assertTrue(isinstance(s2, Comment)) def test_copy_entire_soup(self): html = u"
FooBar
end" soup = self.soup(html) soup_copy = copy.copy(soup) self.assertEqual(soup, soup_copy) def test_copy_tag_copies_contents(self): html = u"
FooBar
end" soup = self.soup(html) div = soup.div div_copy = copy.copy(div) # The two tags look the same, and evaluate to equal. self.assertEqual(unicode(div), unicode(div_copy)) self.assertEqual(div, div_copy) # But they're not the same object. self.assertFalse(div is div_copy) # And they don't have the same relation to the parse tree. The # copy is not associated with a parse tree at all. self.assertEqual(None, div_copy.parent) self.assertEqual(None, div_copy.previous_element) self.assertEqual(None, div_copy.find(string='Bar').next_element) self.assertNotEqual(None, div.find(string='Bar').next_element) class TestSubstitutions(SoupTest): def test_default_formatter_is_minimal(self): markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="minimal") # The < is converted back into < but the e-with-acute is left alone. self.assertEqual( decoded, self.document_for( u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) def test_formatter_html(self): markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="html") self.assertEqual( decoded, self.document_for("<<Sacré bleu!>>")) def test_formatter_minimal(self): markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter="minimal") # The < is converted back into < but the e-with-acute is left alone. self.assertEqual( decoded, self.document_for( u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) def test_formatter_null(self): markup = u"<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" soup = self.soup(markup) decoded = soup.decode(formatter=None) # Neither the angle brackets nor the e-with-acute are converted. # This is not valid HTML, but it's what the user wanted. self.assertEqual(decoded, self.document_for(u"<>")) def test_formatter_custom(self): markup = u"<foo>bar" soup = self.soup(markup) decoded = soup.decode(formatter = lambda x: x.upper()) # Instead of normal entity conversion code, the custom # callable is called on every string. self.assertEqual( decoded, self.document_for(u"BAR")) def test_formatter_is_run_on_attribute_values(self): markup = u'e' soup = self.soup(markup) a = soup.a expect_minimal = u'e' self.assertEqual(expect_minimal, a.decode()) self.assertEqual(expect_minimal, a.decode(formatter="minimal")) expect_html = u'e' self.assertEqual(expect_html, a.decode(formatter="html")) self.assertEqual(markup, a.decode(formatter=None)) expect_upper = u'E' self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) def test_formatter_skips_script_tag_for_html_documents(self): doc = """ """ encoded = BeautifulSoup(doc, 'html.parser').encode() self.assertTrue(b"< < hey > >" in encoded) def test_formatter_skips_style_tag_for_html_documents(self): doc = """ """ encoded = BeautifulSoup(doc, 'html.parser').encode() self.assertTrue(b"< < hey > >" in encoded) def test_prettify_leaves_preformatted_text_alone(self): soup = self.soup("
foo
  \tbar\n  \n  
baz ") # Everything outside the
 tag is reformatted, but everything
        # inside is left alone.
        self.assertEqual(
            u'
\n foo\n
  \tbar\n  \n  
\n baz\n
', soup.div.prettify()) def test_prettify_accepts_formatter(self): soup = BeautifulSoup("foo", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) self.assertTrue("FOO" in pretty) def test_prettify_outputs_unicode_by_default(self): soup = self.soup("") self.assertEqual(unicode, type(soup.prettify())) def test_prettify_can_encode_data(self): soup = self.soup("") self.assertEqual(bytes, type(soup.prettify("utf-8"))) def test_html_entity_substitution_off_by_default(self): markup = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!" soup = self.soup(markup) encoded = soup.b.encode("utf-8") self.assertEqual(encoded, markup.encode('utf-8')) def test_encoding_substitution(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') soup = self.soup(meta_tag) # Parse the document, and the charset apprears unchanged. self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') # Encode the document into some encoding, and the encoding is # substituted into the meta tag. utf_8 = soup.encode("utf-8") self.assertTrue(b"charset=utf-8" in utf_8) euc_jp = soup.encode("euc_jp") self.assertTrue(b"charset=euc_jp" in euc_jp) shift_jis = soup.encode("shift-jis") self.assertTrue(b"charset=shift-jis" in shift_jis) utf_16_u = soup.encode("utf-16").decode("utf-16") self.assertTrue("charset=utf-16" in utf_16_u) def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): markup = ('
foo
') # Beautiful Soup used to try to rewrite the meta tag even if the # meta tag got filtered out by the strainer. This test makes # sure that doesn't happen. strainer = SoupStrainer('pre') soup = self.soup(markup, parse_only=strainer) self.assertEqual(soup.contents[0].name, 'pre') class TestEncoding(SoupTest): """Test the ability to encode objects into strings.""" def test_unicode_string_can_be_encoded(self): html = u"\N{SNOWMAN}" soup = self.soup(html) self.assertEqual(soup.b.string.encode("utf-8"), u"\N{SNOWMAN}".encode("utf-8")) def test_tag_containing_unicode_string_can_be_encoded(self): html = u"\N{SNOWMAN}" soup = self.soup(html) self.assertEqual( soup.b.encode("utf-8"), html.encode("utf-8")) def test_encoding_substitutes_unrecognized_characters_by_default(self): html = u"\N{SNOWMAN}" soup = self.soup(html) self.assertEqual(soup.b.encode("ascii"), b"") def test_encoding_can_be_made_strict(self): html = u"\N{SNOWMAN}" soup = self.soup(html) self.assertRaises( UnicodeEncodeError, soup.encode, "ascii", errors="strict") def test_decode_contents(self): html = u"\N{SNOWMAN}" soup = self.soup(html) self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) def test_encode_contents(self): html = u"\N{SNOWMAN}" soup = self.soup(html) self.assertEqual( u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( encoding="utf8")) def test_deprecated_renderContents(self): html = u"\N{SNOWMAN}" soup = self.soup(html) self.assertEqual( u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) def test_repr(self): html = u"\N{SNOWMAN}" soup = self.soup(html) if PY3K: self.assertEqual(html, repr(soup)) else: self.assertEqual(b'\\u2603', repr(soup)) class TestNavigableStringSubclasses(SoupTest): def test_cdata(self): # None of the current builders turn CDATA sections into CData # objects, but you can create them manually. soup = self.soup("") cdata = CData("foo") soup.insert(1, cdata) self.assertEqual(str(soup), "") self.assertEqual(soup.find(text="foo"), "foo") self.assertEqual(soup.contents[0], "foo") def test_cdata_is_never_formatted(self): """Text inside a CData object is passed into the formatter. But the return value is ignored. """ self.count = 0 def increment(*args): self.count += 1 return "BITTER FAILURE" soup = self.soup("") cdata = CData("<><><>") soup.insert(1, cdata) self.assertEqual( b"<><>]]>", soup.encode(formatter=increment)) self.assertEqual(1, self.count) def test_doctype_ends_in_newline(self): # Unlike other NavigableString subclasses, a DOCTYPE always ends # in a newline. doctype = Doctype("foo") soup = self.soup("") soup.insert(1, doctype) self.assertEqual(soup.encode(), b"\n") def test_declaration(self): d = Declaration("foo") self.assertEqual("", d.output_ready()) class TestSoupSelector(TreeTest): HTML = """ The title Hello there.

An H1

Some text

Some more text

An H2

Another

Bob

Another H2

me span1a1 span1a2 test span2a1

English

English UK

English US

French

""" def setUp(self): self.soup = BeautifulSoup(self.HTML, 'html.parser') def assertSelects(self, selector, expected_ids, **kwargs): el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)] el_ids.sort() expected_ids.sort() self.assertEqual(expected_ids, el_ids, "Selector %s, expected [%s], got [%s]" % ( selector, ', '.join(expected_ids), ', '.join(el_ids) ) ) assertSelect = assertSelects def assertSelectMultiple(self, *tests): for selector, expected_ids in tests: self.assertSelect(selector, expected_ids) def test_one_tag_one(self): els = self.soup.select('title') self.assertEqual(len(els), 1) self.assertEqual(els[0].name, 'title') self.assertEqual(els[0].contents, [u'The title']) def test_one_tag_many(self): els = self.soup.select('div') self.assertEqual(len(els), 4) for div in els: self.assertEqual(div.name, 'div') el = self.soup.select_one('div') self.assertEqual('main', el['id']) def test_select_one_returns_none_if_no_match(self): match = self.soup.select_one('nonexistenttag') self.assertEqual(None, match) def test_tag_in_tag_one(self): els = self.soup.select('div div') self.assertSelects('div div', ['inner', 'data1']) def test_tag_in_tag_many(self): for selector in ('html div', 'html body div', 'body div'): self.assertSelects(selector, ['data1', 'main', 'inner', 'footer']) def test_limit(self): self.assertSelects('html div', ['main'], limit=1) self.assertSelects('html body div', ['inner', 'main'], limit=2) self.assertSelects('body div', ['data1', 'main', 'inner', 'footer'], limit=10) def test_tag_no_match(self): self.assertEqual(len(self.soup.select('del')), 0) def test_invalid_tag(self): self.assertRaises(ValueError, self.soup.select, 'tag%t') def test_select_dashed_tag_ids(self): self.assertSelects('custom-dashed-tag', ['dash1', 'dash2']) def test_select_dashed_by_id(self): dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') self.assertEqual(dashed[0].name, 'custom-dashed-tag') self.assertEqual(dashed[0]['id'], 'dash2') def test_dashed_tag_text(self): self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.') def test_select_dashed_matches_find_all(self): self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag')) def test_header_tags(self): self.assertSelectMultiple( ('h1', ['header1']), ('h2', ['header2', 'header3']), ) def test_class_one(self): for selector in ('.onep', 'p.onep', 'html p.onep'): els = self.soup.select(selector) self.assertEqual(len(els), 1) self.assertEqual(els[0].name, 'p') self.assertEqual(els[0]['class'], ['onep']) def test_class_mismatched_tag(self): els = self.soup.select('div.onep') self.assertEqual(len(els), 0) def test_one_id(self): for selector in ('div#inner', '#inner', 'div div#inner'): self.assertSelects(selector, ['inner']) def test_bad_id(self): els = self.soup.select('#doesnotexist') self.assertEqual(len(els), 0) def test_items_in_id(self): els = self.soup.select('div#inner p') self.assertEqual(len(els), 3) for el in els: self.assertEqual(el.name, 'p') self.assertEqual(els[1]['class'], ['onep']) self.assertFalse(els[0].has_attr('class')) def test_a_bunch_of_emptys(self): for selector in ('div#main del', 'div#main div.oops', 'div div#main'): self.assertEqual(len(self.soup.select(selector)), 0) def test_multi_class_support(self): for selector in ('.class1', 'p.class1', '.class2', 'p.class2', '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): self.assertSelects(selector, ['pmulti']) def test_multi_class_selection(self): for selector in ('.class1.class3', '.class3.class2', '.class1.class2.class3'): self.assertSelects(selector, ['pmulti']) def test_child_selector(self): self.assertSelects('.s1 > a', ['s1a1', 's1a2']) self.assertSelects('.s1 > a span', ['s1a2s1']) def test_child_selector_id(self): self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1']) def test_attribute_equals(self): self.assertSelectMultiple( ('p[class="onep"]', ['p1']), ('p[id="p1"]', ['p1']), ('[class="onep"]', ['p1']), ('[id="p1"]', ['p1']), ('link[rel="stylesheet"]', ['l1']), ('link[type="text/css"]', ['l1']), ('link[href="blah.css"]', ['l1']), ('link[href="no-blah.css"]', []), ('[rel="stylesheet"]', ['l1']), ('[type="text/css"]', ['l1']), ('[href="blah.css"]', ['l1']), ('[href="no-blah.css"]', []), ('p[href="no-blah.css"]', []), ('[href="no-blah.css"]', []), ) def test_attribute_tilde(self): self.assertSelectMultiple( ('p[class~="class1"]', ['pmulti']), ('p[class~="class2"]', ['pmulti']), ('p[class~="class3"]', ['pmulti']), ('[class~="class1"]', ['pmulti']), ('[class~="class2"]', ['pmulti']), ('[class~="class3"]', ['pmulti']), ('a[rel~="friend"]', ['bob']), ('a[rel~="met"]', ['bob']), ('[rel~="friend"]', ['bob']), ('[rel~="met"]', ['bob']), ) def test_attribute_startswith(self): self.assertSelectMultiple( ('[rel^="style"]', ['l1']), ('link[rel^="style"]', ['l1']), ('notlink[rel^="notstyle"]', []), ('[rel^="notstyle"]', []), ('link[rel^="notstyle"]', []), ('link[href^="bla"]', ['l1']), ('a[href^="http://"]', ['bob', 'me']), ('[href^="http://"]', ['bob', 'me']), ('[id^="p"]', ['pmulti', 'p1']), ('[id^="m"]', ['me', 'main']), ('div[id^="m"]', ['main']), ('a[id^="m"]', ['me']), ('div[data-tag^="dashed"]', ['data1']) ) def test_attribute_endswith(self): self.assertSelectMultiple( ('[href$=".css"]', ['l1']), ('link[href$=".css"]', ['l1']), ('link[id$="1"]', ['l1']), ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), ('div[id$="1"]', ['data1']), ('[id$="noending"]', []), ) def test_attribute_contains(self): self.assertSelectMultiple( # From test_attribute_startswith ('[rel*="style"]', ['l1']), ('link[rel*="style"]', ['l1']), ('notlink[rel*="notstyle"]', []), ('[rel*="notstyle"]', []), ('link[rel*="notstyle"]', []), ('link[href*="bla"]', ['l1']), ('[href*="http://"]', ['bob', 'me']), ('[id*="p"]', ['pmulti', 'p1']), ('div[id*="m"]', ['main']), ('a[id*="m"]', ['me']), # From test_attribute_endswith ('[href*=".css"]', ['l1']), ('link[href*=".css"]', ['l1']), ('link[id*="1"]', ['l1']), ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), ('div[id*="1"]', ['data1']), ('[id*="noending"]', []), # New for this test ('[href*="."]', ['bob', 'me', 'l1']), ('a[href*="."]', ['bob', 'me']), ('link[href*="."]', ['l1']), ('div[id*="n"]', ['main', 'inner']), ('div[id*="nn"]', ['inner']), ('div[data-tag*="edval"]', ['data1']) ) def test_attribute_exact_or_hypen(self): self.assertSelectMultiple( ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), ('p[lang|="fr"]', ['lang-fr']), ('p[lang|="gb"]', []), ) def test_attribute_exists(self): self.assertSelectMultiple( ('[rel]', ['l1', 'bob', 'me']), ('link[rel]', ['l1']), ('a[rel]', ['bob', 'me']), ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), ('p[class]', ['p1', 'pmulti']), ('[blah]', []), ('p[blah]', []), ('div[data-tag]', ['data1']) ) def test_quoted_space_in_selector_name(self): html = """
nope
yes
""" soup = BeautifulSoup(html, 'html.parser') [chosen] = soup.select('div[style="display: right"]') self.assertEqual("yes", chosen.string) def test_unsupported_pseudoclass(self): self.assertRaises( NotImplementedError, self.soup.select, "a:no-such-pseudoclass") self.assertRaises( NotImplementedError, self.soup.select, "a:nth-of-type(a)") def test_nth_of_type(self): # Try to select first paragraph els = self.soup.select('div#inner p:nth-of-type(1)') self.assertEqual(len(els), 1) self.assertEqual(els[0].string, u'Some text') # Try to select third paragraph els = self.soup.select('div#inner p:nth-of-type(3)') self.assertEqual(len(els), 1) self.assertEqual(els[0].string, u'Another') # Try to select (non-existent!) fourth paragraph els = self.soup.select('div#inner p:nth-of-type(4)') self.assertEqual(len(els), 0) # Pass in an invalid value. self.assertRaises( ValueError, self.soup.select, 'div p:nth-of-type(0)') def test_nth_of_type_direct_descendant(self): els = self.soup.select('div#inner > p:nth-of-type(1)') self.assertEqual(len(els), 1) self.assertEqual(els[0].string, u'Some text') def test_id_child_selector_nth_of_type(self): self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) def test_select_on_element(self): # Other tests operate on the tree; this operates on an element # within the tree. inner = self.soup.find("div", id="main") selected = inner.select("div") # The
tag was selected. The