blob: ddd1b97dcb1a41e751427d6b98fe6a93456a4b7b [file] [log] [blame]
"""
BitBake code parser

Parses actual code (i.e. python and shell) for functions and in-line
expressions. Used mainly to determine dependencies on other functions
and variables within the BitBake metadata. Also provides a cache for
this information in order to speed up processing.

(Not to be confused with the code that parses the metadata itself,
see lib/bb/parse/ for that).

NOTE: if you change how the parsers gather information you will almost
certainly need to increment CodeParserCache.CACHE_VERSION below so that
any existing codeparser cache gets invalidated. Additionally you'll need
to increment __cache_version__ in cache.py in order to ensure that old
recipe caches don't trigger "Taskhash mismatch" errors.

"""
19
20import ast
21import sys
22import codegen
23import logging
24import pickle
25import bb.pysh as pysh
26import os.path
27import bb.utils, bb.data
28import hashlib
29from itertools import chain
30from bb.pysh import pyshyacc, pyshlex, sherrors
31from bb.cache import MultiProcessCache
32
logger = logging.getLogger('BitBake.CodeParser')

def bbhash(s):
    """Return the hex MD5 digest of string *s* (cache keying, not security)."""
    digest = hashlib.md5(s.encode("utf-8"))
    return digest.hexdigest()
37
def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation"""

    i = 0
    # Bounds check fixes an IndexError when codestr is entirely whitespace
    while i < len(codestr) and codestr[i] in ["\n", "\t", " "]:
        i = i + 1

    if i == 0:
        return codestr

    if i == len(codestr):
        # Nothing but whitespace: no code to re-indent
        return codestr

    if codestr[i-1] == "\t" or codestr[i-1] == " ":
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    return codestr
56
57
58# Basically pickle, in python 2.7.3 at least, does badly with data duplication
59# upon pickling and unpickling. Combine this with duplicate objects and things
60# are a mess.
61#
62# When the sets are originally created, python calls intern() on the set keys
63# which significantly improves memory usage. Sadly the pickle/unpickle process
64# doesn't call intern() on the keys and results in the same strings being duplicated
65# in memory. This also means pickle will save the same string multiple times in
66# the cache file.
67#
68# By having shell and python cacheline objects with setstate/getstate, we force
69# the object creation through our own routine where we can call intern (via internSet).
70#
71# We also use hashable frozensets and ensure we use references to these so that
72# duplicates can be removed, both in memory and in the resulting pickled data.
73#
74# By playing these games, the size of the cache file shrinks dramatically
75# meaning faster load times and the reloaded cache files also consume much less
76# memory. Smaller cache files, faster load times and lower memory usage is good.
77#
78# A custom getstate/setstate using tuples is actually worth 15% cachesize by
79# avoiding duplication of the attribute names!
80
class SetCache(object):
    """Deduplicate frozensets of interned strings.

    Equal sets of keys share a single frozenset object, which keeps both
    in-memory usage and the pickled cache file small (see the comment block
    above for the full rationale).
    """
    def __init__(self):
        # Canonical frozenset -> itself; acts as a dedup table
        self.setcache = {}

    def internSet(self, items):
        """Return a canonical frozenset of sys.intern()ed copies of *items*."""
        new = []
        for i in items:
            new.append(sys.intern(i))
        s = frozenset(new)
        # Key on the frozenset itself rather than hash(s): two distinct sets
        # can share a hash value, and keying on the raw hash could hand back
        # the wrong set on such a collision.
        if s in self.setcache:
            return self.setcache[s]
        self.setcache[s] = s
        return s

codecache = SetCache()
98
class pythonCacheLine(object):
    """One cached python-parse result: referenced variables, executed
    functions, and contains-check values, all interned via codecache."""

    def __init__(self, refs, execs, contains):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        self.contains = {key: codecache.internSet(vals)
                         for key, vals in contains.items()}

    def __getstate__(self):
        # Tuple form keeps attribute names out of the pickle stream
        return (self.refs, self.execs, self.contains)

    def __setstate__(self, state):
        refs, execs, contains = state
        # Re-run __init__ so the sets get re-interned on unpickle
        self.__init__(refs, execs, contains)

    def __hash__(self):
        parts = [hash(self.refs), hash(self.execs)]
        for key in sorted(self.contains.keys()):
            parts.extend((key, hash(self.contains[key])))
        return hash(tuple(parts))

    def __repr__(self):
        return " ".join([str(self.refs), str(self.execs), str(self.contains)])
120
121
class shellCacheLine(object):
    """One cached shell-parse result: the set of executed commands,
    interned via codecache."""

    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # Deliberately a bare value, not a tuple, matching __setstate__
        return self.execs

    def __setstate__(self, state):
        # Re-run __init__ so the set gets re-interned on unpickle
        self.__init__(state)

    def __hash__(self):
        return hash(self.execs)

    def __repr__(self):
        return str(self.execs)
136
class CodeParserCache(MultiProcessCache):
    """Persistent cache of python/shell parse results, shared between
    bitbake worker processes via MultiProcessCache."""

    cache_file_name = "bb_codeparser.dat"
    # NOTE: you must increment this if you change how the parsers gather information,
    # so that an existing cache gets invalidated. Additionally you'll need
    # to increment __cache_version__ in cache.py in order to ensure that old
    # recipe caches don't trigger "Taskhash mismatch" errors.
    CACHE_VERSION = 10

    def __init__(self):
        MultiProcessCache.__init__(self)
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains):
        """Return a shared pythonCacheLine for these parse results."""
        candidate = pythonCacheLine(refs, execs, contains)
        return self.pythoncachelines.setdefault(hash(candidate), candidate)

    def newShellCacheLine(self, execs):
        """Return a shared shellCacheLine for these executed commands."""
        candidate = shellCacheLine(execs)
        return self.shellcachelines.setdefault(hash(candidate), candidate)

    def init_cache(self, d):
        # Nothing to do if the caches were already loaded
        if self.pythoncache:
            return

        MultiProcessCache.init_cache(self, d)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        # One dict for python results, one for shell results
        return [{}, {}]
187
# Module-level singleton used by the parser classes below
codeparsercache = CodeParserCache()

def parser_cache_init(d):
    """Load (or create) the codeparser cache for datastore *d*."""
    codeparsercache.init_cache(d)

def parser_cache_save():
    """Write this process's new cache entries to its extras file."""
    codeparsercache.save_extras()

def parser_cache_savemerge():
    """Merge all per-process extras files into the main cache file."""
    codeparsercache.save_merge()
198
Logger = logging.getLoggerClass()
class BufferedLogger(Logger):
    """A logger that queues every record and only forwards the queued
    records to *target* when flush() is called."""

    def __init__(self, name, level=0, target=None):
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        # Queue instead of emitting immediately
        self.buffer.append(record)

    def flush(self):
        # Forward only records the target's level would accept, then reset
        for queued in self.buffer:
            if self.target.isEnabledFor(queued.levelno):
                self.target.handle(queued)
        self.buffer = []
215
class PythonParser():
    """Parse a fragment of python code, recording the datastore variables
    it references, the functions it executes and any contains-style checks
    it makes.  Results are memoized in the codeparser cache keyed by a hash
    of the code text."""

    # Call-name suffixes/names whose first argument names a datastore variable
    getvars = (".getVar", ".appendVar", ".prependVar", "oe.utils.conditional")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    # Calls performing "does VAR contain value" checks, tracked per-value
    containsfuncs = ("bb.utils.contains", "base_contains")
    containsanyfuncs = ("bb.utils.contains_any", "bb.utils.filter")
    # Calls that execute another metadata function by (string) name
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug(2, 'Failed to convert function and argument to source form')
        else:
            self.log.debug(1, self.unhandled_message % (funcstr, argstr))

    def visit_Call(self, node):
        """Inspect one ast.Call node and record what it references/executes.

        Branch order matters: variable accessors first, then .expand,
        then exec_func-style calls, and finally any other direct call.
        """
        name = self.called_node_name(node.func)
        if name and (name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs or name in self.containsanyfuncs):
            if isinstance(node.args[0], ast.Str):
                varname = node.args[0].s
                if name in self.containsfuncs and isinstance(node.args[1], ast.Str):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname].add(node.args[1].s)
                elif name in self.containsanyfuncs and isinstance(node.args[1], ast.Str):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    # contains_any/filter take a whitespace-separated list
                    self.contains[varname].update(node.args[1].s.split())
                elif name.endswith(self.getvarflags):
                    if isinstance(node.args[1], ast.Str):
                        # Flag accesses are recorded as VAR[flag]
                        self.references.add('%s[%s]' % (varname, node.args[1].s))
                    else:
                        self.warn(node.func, node.args[1])
                else:
                    self.references.add(varname)
            else:
                self.warn(node.func, node.args[0])
        elif name and name.endswith(".expand"):
            if isinstance(node.args[0], ast.Str):
                # Expand the literal with the datastore expansion parser and
                # merge whatever it references/executes into our results
                value = node.args[0].s
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if isinstance(node.args[0], ast.Str):
                # Indirect execution via bb.build.exec_func/exec_task
                self.var_execs.add(node.args[0].s)
            else:
                self.warn(node.func, node.args[0])
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            # Any other direct call is recorded as an executed function
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form"""
        # Walk Attribute chains down to the root Name, e.g.
        # bb.utils.contains -> "bb.utils.contains"; returns None for
        # anything that is not a plain dotted name (e.g. a subscript)
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break

    def __init__(self, name, log):
        # name: the function/variable being parsed, used in messages and
        # as the expansion context for .expand() literals
        self.name = name
        # Functions executed indirectly via bb.build.exec_func/exec_task
        self.var_execs = set()
        # varname -> set of values checked with contains-style calls
        self.contains = {}
        # Names of directly called functions
        self.execs = set()
        # Names of referenced datastore variables (VAR or VAR[flag])
        self.references = set()
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, log)

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    def parse_python(self, node, lineno=0, filename="<string>"):
        """Parse python code *node*, populating self.references, self.execs
        and self.contains.  Previously seen code is served from the
        codeparser cache (main cache first, then unmerged extras)."""
        if not node or not node.strip():
            return

        h = bbhash(str(node))

        if h in codeparsercache.pythoncache:
            self.references = set(codeparsercache.pythoncache[h].refs)
            self.execs = set(codeparsercache.pythoncache[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncache[h].contains:
                self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
            return

        if h in codeparsercache.pythoncacheextras:
            self.references = set(codeparsercache.pythoncacheextras[h].refs)
            self.execs = set(codeparsercache.pythoncacheextras[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncacheextras[h].contains:
                self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
            return

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
            ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if n.__class__.__name__ == "Call":
                self.visit_Call(n)

        self.execs.update(self.var_execs)

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)
336
class ShellParser():
    """Parse shell code with pysh, recording the external commands it
    executes (commands defined as functions within the code itself are
    excluded).  Results are memoized in the codeparser cache."""

    def __init__(self, name, log):
        # Shell functions defined within the parsed code
        self.funcdefs = set()
        # Every command name seen, including locally defined functions
        self.allexecs = set()
        # allexecs minus funcdefs: the external command dependencies
        self.execs = set()
        self.log = BufferedLogger('BitBake.Data.%s' % name, logging.DEBUG, log)
        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        # Check the main cache first, then the unmerged extras
        if h in codeparsercache.shellcache:
            self.execs = set(codeparsercache.shellcache[h].execs)
            return self.execs

        if h in codeparsercache.shellcacheextras:
            self.execs = set(codeparsercache.shellcacheextras[h].execs)
            return self.execs

        self._parse_shell(value)
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Run the pysh parser over *value* and walk the resulting tokens."""
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except pyshlex.NeedMore:
            raise sherrors.ShellSyntaxError("Unexpected EOF")

        self.process_tokens(tokens)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.

        Each handler returns (more_tokens, words): nested tokens to recurse
        into and words to scan for command names.
        """

        def function_definition(value):
            # Record the function name so it isn't counted as an external
            # command, then descend into its body
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            # Flattens elif chains recursively into one command stream
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            # Assignment values may contain $() substitutions, so scan them too
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        # Dispatch table: pysh token type -> handler
        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        def process_token_list(tokens):
            for token in tokens:
                if isinstance(token, list):
                    # Nested token lists are flattened recursively
                    process_token_list(token)
                    continue
                name, value = token
                try:
                    more_tokens, words = token_handlers[name](value)
                except KeyError:
                    raise NotImplementedError("Unsupported token type " + name)

                if more_tokens:
                    self.process_tokens(more_tokens)

                if words:
                    self.process_words(words)

        process_token_list(tokens)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        words = list(words)
        # First pass: recurse into `...` / $(...) substitutions, and drop
        # cmd_name/cmd_word entries whose substitutions we just handled
        for word in list(words):
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                if part[0] in ('`', '$('):
                    command = pyshlex.wordtree_as_string(part[1:-1])
                    self._parse_shell(command)

            if word[0] in ("cmd_name", "cmd_word"):
                if word in words:
                    words.remove(word)

        # Second pass: find the actual command name, skipping VAR=value
        # assignment prefixes (after one, the command arrives as a TOKEN)
        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    # Command name comes from a variable; can't be tracked
                    self.log.debug(1, self.unhandled_template % cmd)
                elif cmd == "eval":
                    # Re-parse the evaluated string as shell code
                    command = " ".join(word for _, word in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break