#!/usr/bin/env python3
#
# Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
#
# This is free software, licensed under the GNU General Public License v2.
# See /LICENSE for more information.

import argparse
import calendar
import datetime
import errno
import fcntl
import hashlib
import json
import os
import os.path
import re
import shutil
import ssl
import subprocess
import sys
import time
import urllib.request

TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
TMPDIR_DL = os.path.join(TMPDIR, 'dl')


class PathException(Exception): pass
class DownloadGitHubError(Exception): pass


class Path(object):
    """Context class for preparing and cleaning up directories.

    If ``preclean`` is ``True``, ``path`` will be removed on context enter.

    If ``isdir`` is ``True``, ``path`` will be created on context enter.

    If ``keep`` is ``True``, ``path`` will NOT be removed on context exit.
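
    A minimal usage sketch (paths are illustrative)::

        with Path('/tmp/dl/work', preclean=True) as p:
            ...  # populate p.path; it is removed again on exit since keep=False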
| 41 | """ |
| 42 | |
| 43 | def __init__(self, path, isdir=True, preclean=False, keep=False): |
| 44 | self.path = path |
| 45 | self.isdir = isdir |
| 46 | self.preclean = preclean |
| 47 | self.keep = keep |
| 48 | |
| 49 | def __enter__(self): |
| 50 | if self.preclean: |
| 51 | self.rm_all(self.path) |
| 52 | if self.isdir: |
| 53 | self.mkdir_all(self.path) |
| 54 | return self |
| 55 | |
| 56 | def __exit__(self, exc_type, exc_value, traceback): |
| 57 | if not self.keep: |
| 58 | self.rm_all(self.path) |
| 59 | |
| 60 | @staticmethod |
    def mkdir_all(path):
        """Same as mkdir -p."""
        # walk every component of the path so that all intermediate
        # directories get created; os.path.split() alone only peels off
        # the last component
        p = os.sep if os.path.isabs(path) else ''
        for name in path.split(os.sep):
            if not name:
                continue
            p = os.path.join(p, name)
            Path._mkdir(p)

    @staticmethod
    def _rmdir_dir(dir_):
        names = Path._listdir(dir_)
        for name in names:
            p = os.path.join(dir_, name)
            Path.rm_all(p)
        Path._rmdir(dir_)

    @staticmethod
    def _mkdir(path):
        Path._os_func(os.mkdir, path, errno.EEXIST)

    @staticmethod
    def _rmdir(path):
        Path._os_func(os.rmdir, path, errno.ENOENT)

    @staticmethod
    def _remove(path):
        Path._os_func(os.remove, path, errno.ENOENT)

    @staticmethod
    def _listdir(path):
        return Path._os_func(os.listdir, path, errno.ENOENT, default=[])

    @staticmethod
    def _os_func(func, path, errno_, default=None):
        """Call func(path) in an idempotent way.

        On exception ``e``, if its type is OSError and ``e.errno == errno_``,
        return ``default``; otherwise, re-raise.  (The parameter is named
        ``errno_`` to avoid shadowing the ``errno`` module.)
        """
        try:
            return func(path)
        except OSError as e:
            if e.errno == errno_:
                return default
            else:
                raise

    @staticmethod
    def rm_all(path):
        """Same as rm -r."""
        if os.path.islink(path):
            Path._remove(path)
        elif os.path.isdir(path):
            Path._rmdir_dir(path)
        else:
            Path._remove(path)

    @staticmethod
    def untar(path, into=None):
| 120 | """Extract tarball at ``path`` into subdir ``into``. |
| 121 | |
| 122 | return subdir name if and only if there exists one, otherwise raise PathException |
| 123 | """ |
        args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
        subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
        dirs = os.listdir(into)
        if len(dirs) == 1:
            return dirs[0]
        else:
            raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))

    @staticmethod
    def tar(path, subdir, into=None, ts=None):
        """Pack ``path`` into tarball ``into``."""
        # --sort=name requires a recent build of GNU tar
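        #
        # For orientation, with an illustrative into='foo.tar.xz' the command
        # assembled below comes out roughly as:
        #
        #   XZ_OPT=-7e tar --numeric-owner --owner=0 --group=0 --sort=name \
        #       --mode=a-s -C <path> -cf foo.tar.xz <subdir> --mtime=@<ts> -J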
        args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name', '--mode=a-s']
        args += ['-C', path, '-cf', into, subdir]
        envs = os.environ.copy()
        if ts is not None:
            args.append('--mtime=@%d' % ts)
        if into.endswith('.zst'):
            args.append('-I zstd -T0 --ultra -20')
        elif into.endswith('.xz'):
            envs['XZ_OPT'] = '-7e'
            args.append('-J')
        elif into.endswith('.bz2'):
            args.append('-j')
        elif into.endswith('.gz'):
            args.append('-z')
            envs['GZIP'] = '-n'
        else:
            raise PathException('unknown compression type %s' % into)
        subprocess.check_call(args, env=envs)


class GitHubCommitTsCache(object):
    __cachef = 'github.commit.ts.cache'
    __cachen = 2048
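
    # On-disk cache format, one whitespace-separated entry per line
    # (written by _cache_flush, parsed by _cache_init); the key is the
    # GitHub API URL path the timestamp was fetched from:
    #
    #   <key> <commit-ts> <updated-ts>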

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        self.cache = {}

    def get(self, k):
        """Get timestamp with key ``k``."""
        fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
        with os.fdopen(fileno) as fin:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_SH)
                self._cache_init(fin)
                if k in self.cache:
                    ts = self.cache[k][0]
                    return ts
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)
        return None

    def set(self, k, v):
        """Update timestamp with ``k``."""
        fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
        with os.fdopen(fileno, 'w+') as f:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_EX)
                self._cache_init(f)
                self.cache[k] = (v, int(time.time()))
                self._cache_flush(f)
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)

    def _cache_init(self, fin):
        for line in fin:
            k, ts, updated = line.split()
            ts = int(ts)
            updated = int(updated)
            self.cache[k] = (ts, updated)
    def _cache_flush(self, fout):
        # keep the most recently updated entries when trimming the cache;
        # sorting ascending would evict the freshest entries instead
        cache = sorted(self.cache.items(), key=lambda a: a[1][1], reverse=True)
        cache = cache[:self.__cachen]
        self.cache = {}
        os.ftruncate(fout.fileno(), 0)
        fout.seek(0, os.SEEK_SET)
        for k, ent in cache:
            ts = ent[0]
            updated = ent[1]
            line = '{0} {1} {2}\n'.format(k, ts, updated)
            fout.write(line)


class DownloadGitHubTarball(object):
    """Download and repack archive tarball from GitHub.

    Compared with the method of packing after cloning the whole repo, this
    method is friendlier to users with a fragile internet connection.

    However, this method has limitations:

    - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
      This affects fetching the commit date for reproducible tarballs.
      Download through the archive link is not affected.

    - GitHub archives do not contain the source code of submodules.

    - GitHub archives seem to respect .gitattributes and ignore paths with
      export-ignore attributes.

    For the first two issues, the method will fail loudly to allow falling
    back to the clone-then-pack method.

    As for the third issue, to make sure that this method only produces
    tarballs identical to those of the fallback method, we require the
    expected hash value to be supplied.  That means the first tarball will
    need to be prepared by the clone-then-pack method.
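
    A rough usage sketch (all argument values are illustrative)::

        args = argparse.Namespace(
            dl_dir='dl', url='https://github.com/owner/repo.git',
            subdir='repo-2018-01-01-0123abcd', version='<full 40-char sha1>',
            source='repo-2018-01-01-0123abcd.tar.xz',
            hash='<expected sha256>', submodules=['skip'])
        DownloadGitHubTarball(args).download()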
| 235 | """ |
| 236 | |
    __repo_url_regex = re.compile(r'^(?:https|git)://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

    def __init__(self, args):
        self.dl_dir = args.dl_dir
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        self.submodules = args.submodules
        self.url = args.url
        self._init_owner_repo()
        self.xhash = args.hash
        self._init_hasher()
        self.commit_ts = None  # lazy load commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'

    def download(self):
        """Download and repack GitHub archive tarball."""
        if self.submodules and self.submodules != ['skip']:
            raise self._error('Fetching submodules is not yet supported')
        self._init_commit_ts()
        with Path(TMPDIR_DL, keep=True) as dir_dl:
            # fetch tarball from GitHub
            tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
            with Path(tarball_path, isdir=False):
                self._fetch(tarball_path)
                # unpack
                d = os.path.join(dir_dl.path, self.subdir + '.untar')
                with Path(d, preclean=True) as dir_untar:
                    tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                    dir0 = os.path.join(dir_untar.path, tarball_prefix)
                    dir1 = os.path.join(dir_untar.path, self.subdir)
                    # submodules check
                    if self.submodules != ['skip'] and self._has_submodule(dir0):
                        raise self._error('Fetching submodules is not yet supported')
                    # rename subdir
                    os.rename(dir0, dir1)
                    # repack
                    into = os.path.join(TMPDIR_DL, self.source)
                    Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                    try:
                        self._hash_check(into)
                    except Exception:
                        Path.rm_all(into)
                        raise
                    # move to target location
                    file1 = os.path.join(self.dl_dir, self.source)
                    if into != file1:
                        shutil.move(into, file1)

    def _has_submodule(self, dir_):
        m = os.path.join(dir_, '.gitmodules')
        try:
            st = os.stat(m)
            return st.st_size > 0
        except OSError as e:
            return e.errno != errno.ENOENT

    def _init_owner_repo(self):
        m = self.__repo_url_regex.search(self.url)
        if m is None:
            raise self._error('Invalid github url: {}'.format(self.url))
        owner = m.group('owner')
        repo = m.group('repo')
        if repo.endswith('.git'):
            repo = repo[:-4]
        self.owner = owner
        self.repo = repo

    def _init_hasher(self):
        xhash = self.xhash
        if len(xhash) == 64:
            self.hasher = hashlib.sha256()
        elif len(xhash) == 32:
            self.hasher = hashlib.md5()
        else:
            raise self._error('Requires sha256sum for verification')
        self.xhash = xhash

    def _hash_check(self, f):
        with open(f, 'rb') as fin:
            while True:
                d = fin.read(4096)
                if not d:
                    break
                self.hasher.update(d)
        xhash = self.hasher.hexdigest()
        if xhash != self.xhash:
            raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))

    def _init_commit_ts(self):
        if self.commit_ts is not None:
            return
        # GitHub provides 2 APIs[1,2] for fetching commit data.  API[1] is
        # more terse while API[2] provides more verbose info such as the
        # commit diff, etc.  That's the main reason why API[1] is preferred:
        # the response size is predictable.
        #
        # However, API[1] only accepts a complete commit sha1sum as the
        # parameter, while API[2] is more liberal, also accepting partial
        # commit ids, tags, etc.
        #
        # [1] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
        # [2] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
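        #
        # For a given <owner>/<repo> and <version>, the two URL paths built
        # below come out as (illustrative):
        #
        #   /repos/<owner>/<repo>/git/commits/<version>   (API[1])
        #   /repos/<owner>/<repo>/commits/<version>       (API[2])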
        apis = [
            {
                'url': self._make_repo_url_path('git', 'commits', self.version),
                'attr_path': ('committer', 'date'),
            }, {
                'url': self._make_repo_url_path('commits', self.version),
                'attr_path': ('commit', 'committer', 'date'),
            },
        ]
        version_is_sha1sum = len(self.version) == 40
        if not version_is_sha1sum:
            apis.insert(0, apis.pop())
        reasons = ''
        for api in apis:
            url = api['url']
            attr_path = api['attr_path']
            try:
                ct = self.commit_ts_cache.get(url)
                if ct is not None:
                    self.commit_ts = ct
                    return
                ct = self._init_commit_ts_remote_get(url, attr_path)
                self.commit_ts = ct
                self.commit_ts_cache.set(url, ct)
                return
            except Exception as e:
                reasons += '\n' + ("  {}: {}".format(url, e))
        raise self._error('Cannot fetch commit ts:{}'.format(reasons))

    def _init_commit_ts_remote_get(self, url, attrpath):
        resp = self._make_request(url)
        data = resp.read()
        date = json.loads(data)
        for attr in attrpath:
            date = date[attr]
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        date = date.timetuple()
        ct = calendar.timegm(date)
        return ct

    def _fetch(self, path):
        """Fetch tarball of the specified version ref."""
        ref = self.version
        url = self._make_repo_url_path('tarball', ref)
        resp = self._make_request(url)
        with open(path, 'wb') as fout:
            while True:
                d = resp.read(4096)
                if not d:
                    break
                fout.write(d)

    def _make_repo_url_path(self, *args):
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        if args:
            url += '/' + '/'.join(args)
        return url

    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``."""
        url = 'https://api.github.com' + path
        headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'OpenWrt',
        }
        req = urllib.request.Request(url, headers=headers)
        # certificate verification is skipped here; the tarball's integrity
        # is still verified against the expected hash in _hash_check()
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib.request.urlopen(req, context=sslcontext)
        return fileobj

    def _error(self, msg):
        return DownloadGitHubError('{}: {}'.format(self.source, msg))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
    parser.add_argument('--url', help='Download URL')
    parser.add_argument('--subdir', help='Source code subdir name')
    parser.add_argument('--version', help='Source code version')
    parser.add_argument('--source', help='Source tarball filename')
    parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
    parser.add_argument('--submodules', nargs='*', help='List of submodules, or "skip"')
    args = parser.parse_args()
    try:
        method = DownloadGitHubTarball(args)
        method.download()
    except Exception as ex:
        sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url))
        sys.stderr.write('{}\n'.format(ex))
        sys.exit(1)

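# Example invocation (script name and argument values are illustrative):
#
#   python3 dl-github-tarball.py --dl-dir dl \
#       --url https://github.com/owner/repo.git \
#       --subdir repo-2018-01-01-0123abcd --version <full-40-char-sha1> \
#       --source repo-2018-01-01-0123abcd.tar.xz --hash <sha256> \
#       --submodules skip
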
if __name__ == '__main__':
    main()