blob: 083395fb46ce803fd2679d875f15481564627f6e [file] [log] [blame]
"""Diagnostic functions, mainly for use when doing tech support."""

__license__ = "MIT"

import cProfile
import os
import pstats
import random
import sys
import tempfile
import time
import traceback
from html.parser import HTMLParser
from io import StringIO

import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry

def diagnose(data):
    """Diagnostic suite for isolating common problems.

    :param data: The markup to diagnose: a string of markup, a
        file-like object containing markup, or the name of a file
        containing markup.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Determine which of the candidate parsers are actually installed.
    # Build a fresh list rather than calling .remove() inside the loop:
    # removing from a list while iterating it skips the element that
    # follows each removal, so missing parsers could go unreported.
    basic_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        for builder in builder_registry.builders:
            if name in builder.features:
                basic_parsers.append(name)
                break
        else:
            print((
                "I noticed that %s is not installed. Installing it may help." %
                name))

    if 'lxml' in basic_parsers:
        # lxml can also act as an XML parser; exercise that mode too.
        basic_parsers.append(["lxml", "xml"])
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
        except ImportError as e:
            print (
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError as e:
            print (
                "html5lib is not installed or couldn't be imported.")

    # Turn whatever we were given into a markup string.
    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Use a context manager so the file handle isn't leaked.
        with open(data) as f:
            data = f.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print()

    # Run the markup through every available parser and show the result.
    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
80
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.
    """
    from lxml import etree
    events = etree.iterparse(StringIO(data), html=html, **kwargs)
    for event, element in events:
        print("%s, %4s, %s" % (event, element.tag, element.text))
90
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, message):
        # Single funnel for all event announcements.
        print(message)

    def handle_starttag(self, name, attrs):
        self._p(f"{name} START")

    def handle_endtag(self, name):
        self._p(f"{name} END")

    def handle_data(self, data):
        self._p(f"{data} DATA")

    def handle_charref(self, name):
        self._p(f"{name} CHARREF")

    def handle_entityref(self, name):
        self._p(f"{name} ENTITYREF")

    def handle_comment(self, data):
        self._p(f"{data} COMMENT")

    def handle_decl(self, data):
        self._p(f"{data} DECL")

    def unknown_decl(self, data):
        self._p(f"{data} UNKNOWN-DECL")

    def handle_pi(self, data):
        self._p(f"{data} PI")
123
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    AnnouncingParser().feed(data)
132
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    """Generate a random word-like string."""
    # Alternate consonants (even positions) and vowels (odd positions)
    # so the result looks pronounceable.
    return ''.join(
        random.choice(_consonants if position % 2 == 0 else _vowels)
        for position in range(length))
146
def rsentence(length=4):
    """Generate a random sentence-like string."""
    words = [rword(random.randint(4, 9)) for _ in range(length)]
    return " ".join(words)
150
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for _ in range(num_elements):
        # 0 opens a random tag, 1 emits text, 2 closes a random tag,
        # 3 emits nothing. Open/close tags are chosen independently,
        # so mismatches (the whole point) are common.
        choice = random.randint(0, 3)
        if choice == 0:
            elements.append("<%s>" % random.choice(tag_names))
        elif choice == 1:
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            elements.append("</%s>" % random.choice(tag_names))
    return "<html>" + "\n".join(elements) + "</html>"
168
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    :param num_elements: Approximate number of elements in the
        randomly generated document fed to each parser.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            # perf_counter is monotonic and high-resolution; time.time
            # is wall-clock and can jump if the system clock changes.
            a = time.perf_counter()
            soup = BeautifulSoup(data, parser)
            b = time.perf_counter()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    # Baseline: raw lxml with no Beautiful Soup overhead.
    from lxml import etree
    a = time.perf_counter()
    etree.HTML(data)
    b = time.perf_counter()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    # Baseline: raw html5lib with no Beautiful Soup overhead.
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.perf_counter()
    parser.parse(data)
    b = time.perf_counter()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
200
def profile(num_elements=100000, parser="lxml"):
    """Use Python's profiler on a randomly generated document.

    :param num_elements: Approximate number of elements in the
        generated document.
    :param parser: The parser to profile, as a BeautifulSoup
        parser name.
    """
    # The temp file holds the raw cProfile output; the context manager
    # keeps it alive until pstats has read it, then cleans it up
    # (the original leaked the open handle).
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name

        data = rdoc(num_elements)
        # Named `context` so we don't shadow the `vars` builtin.
        context = dict(bs4=bs4, data=data, parser=parser)
        cProfile.runctx('bs4.BeautifulSoup(data, parser)', context, context,
                        filename)

        stats = pstats.Stats(filename)
        # stats.strip_dirs()
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
214
if __name__ == '__main__':
    # Script entry point: run the diagnostics on markup piped in on
    # standard input.
    diagnose(sys.stdin.read())