blob: 083395fb46ce803fd2679d875f15481564627f6e [file] [log] [blame]
"""Diagnostic functions, mainly for use when doing tech support."""

__license__ = "MIT"

import cProfile
import os
import pstats
import random
import sys
import tempfile
import time
import traceback
from html.parser import HTMLParser
from io import StringIO

import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry

def diagnose(data):
    """Diagnostic suite for isolating common problems.

    :param data: The markup to diagnose: a string of markup, a
        file-like object containing markup, or the name of a file
        containing markup.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Determine which of the candidate parsers are actually installed.
    # Build a fresh list rather than calling .remove() inside the loop:
    # removing from a list while iterating it skips the element that
    # follows each removal, so missing parsers could go unreported.
    basic_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        for builder in builder_registry.builders:
            if name in builder.features:
                basic_parsers.append(name)
                break
        else:
            print((
                "I noticed that %s is not installed. Installing it may help." %
                name))

    if 'lxml' in basic_parsers:
        # lxml can also act as an XML parser; exercise that mode too.
        basic_parsers.append(["lxml", "xml"])
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
        except ImportError as e:
            print (
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError as e:
            print (
                "html5lib is not installed or couldn't be imported.")

    # Turn whatever we were given into a markup string.
    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Use a context manager so the file handle isn't leaked.
        with open(data) as f:
            data = f.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    print()

    # Run the markup through every available parser and show the result.
    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
80
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.
    """
    from lxml import etree
    events = etree.iterparse(StringIO(data), html=html, **kwargs)
    for event, element in events:
        print("%s, %4s, %s" % (event, element.tag, element.text))
90
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, message):
        # Single funnel for all event announcements.
        print(message)

    def handle_starttag(self, name, attrs):
        self._p(f"{name} START")

    def handle_endtag(self, name):
        self._p(f"{name} END")

    def handle_data(self, data):
        self._p(f"{data} DATA")

    def handle_charref(self, name):
        self._p(f"{name} CHARREF")

    def handle_entityref(self, name):
        self._p(f"{name} ENTITYREF")

    def handle_comment(self, data):
        self._p(f"{data} COMMENT")

    def handle_decl(self, data):
        self._p(f"{data} DECL")

    def unknown_decl(self, data):
        self._p(f"{data} UNKNOWN-DECL")

    def handle_pi(self, data):
        self._p(f"{data} PI")
123
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    AnnouncingParser().feed(data)
132
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    """Generate a random word-like string."""
    # Alternate consonants (even positions) and vowels (odd positions)
    # so the result looks pronounceable.
    return ''.join(
        random.choice(_consonants if position % 2 == 0 else _vowels)
        for position in range(length))
146
def rsentence(length=4):
    """Generate a random sentence-like string."""
    words = [rword(random.randint(4, 9)) for _ in range(length)]
    return " ".join(words)
150
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for _ in range(num_elements):
        # 0 opens a random tag, 1 emits text, 2 closes a random tag,
        # 3 emits nothing. Open/close tags are chosen independently,
        # so mismatches (the whole point) are common.
        choice = random.randint(0, 3)
        if choice == 0:
            elements.append("<%s>" % random.choice(tag_names))
        elif choice == 1:
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            elements.append("</%s>" % random.choice(tag_names))
    return "<html>" + "\n".join(elements) + "</html>"
168
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    :param num_elements: Approximate number of elements in the
        randomly generated document fed to each parser.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            # perf_counter is monotonic and high-resolution; time.time
            # is wall-clock and can jump if the system clock changes.
            a = time.perf_counter()
            soup = BeautifulSoup(data, parser)
            b = time.perf_counter()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    # Baseline: raw lxml with no Beautiful Soup overhead.
    from lxml import etree
    a = time.perf_counter()
    etree.HTML(data)
    b = time.perf_counter()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    # Baseline: raw html5lib with no Beautiful Soup overhead.
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.perf_counter()
    parser.parse(data)
    b = time.perf_counter()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
200
def profile(num_elements=100000, parser="lxml"):
    """Use Python's profiler on a randomly generated document.

    :param num_elements: Approximate number of elements in the
        generated document.
    :param parser: The parser to profile, as a BeautifulSoup
        parser name.
    """
    # The temp file holds the raw cProfile output; the context manager
    # keeps it alive until pstats has read it, then cleans it up
    # (the original leaked the open handle).
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name

        data = rdoc(num_elements)
        # Named `context` so we don't shadow the `vars` builtin.
        context = dict(bs4=bs4, data=data, parser=parser)
        cProfile.runctx('bs4.BeautifulSoup(data, parser)', context, context,
                        filename)

        stats = pstats.Stats(filename)
        # stats.strip_dirs()
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
214
if __name__ == '__main__':
    # Script entry point: run the diagnostics on markup piped in on
    # standard input.
    diagnose(sys.stdin.read())