| #!/usr/bin/env python3 |
| # |
| # Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com> |
| # |
| # This is free software, licensed under the GNU General Public License v2. |
| # See /LICENSE for more information. |
| |
| import argparse |
| import calendar |
| import datetime |
| import errno |
| import fcntl |
| import hashlib |
| import json |
| import os |
| import os.path |
| import re |
| import shutil |
| import ssl |
| import subprocess |
| import sys |
| import time |
| import urllib.request |
| |
| TMPDIR = os.environ.get('TMP_DIR') or '/tmp' |
| TMPDIR_DL = os.path.join(TMPDIR, 'dl') |
| |
| |
| class PathException(Exception): pass |
| class DownloadGitHubError(Exception): pass |
| |
| |
| class Path(object): |
| """Context class for preparing and cleaning up directories. |
| |
| If ```preclean` is ``False``, ``path`` will NOT be removed on context enter |
| |
| If ``path`` ``isdir``, then it will be created on context enter. |
| |
| If ``keep`` is True, then ``path`` will NOT be removed on context exit |
| """ |
| |
| def __init__(self, path, isdir=True, preclean=False, keep=False): |
| self.path = path |
| self.isdir = isdir |
| self.preclean = preclean |
| self.keep = keep |
| |
| def __enter__(self): |
| if self.preclean: |
| self.rm_all(self.path) |
| if self.isdir: |
| self.mkdir_all(self.path) |
| return self |
| |
| def __exit__(self, exc_type, exc_value, traceback): |
| if not self.keep: |
| self.rm_all(self.path) |
| |
| @staticmethod |
| def mkdir_all(path): |
| """Same as mkdir -p.""" |
| names = os.path.split(path) |
| p = '' |
| for name in names: |
| p = os.path.join(p, name) |
| Path._mkdir(p) |
| |
| @staticmethod |
| def _rmdir_dir(dir_): |
| names = Path._listdir(dir_) |
| for name in names: |
| p = os.path.join(dir_, name) |
| Path.rm_all(p) |
| Path._rmdir(dir_) |
| |
| @staticmethod |
| def _mkdir(path): |
| Path._os_func(os.mkdir, path, errno.EEXIST) |
| |
| @staticmethod |
| def _rmdir(path): |
| Path._os_func(os.rmdir, path, errno.ENOENT) |
| |
| @staticmethod |
| def _remove(path): |
| Path._os_func(os.remove, path, errno.ENOENT) |
| |
| @staticmethod |
| def _listdir(path): |
| return Path._os_func(os.listdir, path, errno.ENOENT, default=[]) |
| |
| @staticmethod |
| def _os_func(func, path, errno, default=None): |
| """Call func(path) in an idempotent way. |
| |
| On exception ``ex``, if the type is OSError and ``ex.errno == errno``, |
| return ``default``, otherwise, re-raise |
| """ |
| try: |
| return func(path) |
| except OSError as e: |
| if e.errno == errno: |
| return default |
| else: |
| raise |
| |
| @staticmethod |
| def rm_all(path): |
| """Same as rm -r.""" |
| if os.path.islink(path): |
| Path._remove(path) |
| elif os.path.isdir(path): |
| Path._rmdir_dir(path) |
| else: |
| Path._remove(path) |
| |
| @staticmethod |
| def untar(path, into=None): |
| """Extract tarball at ``path`` into subdir ``into``. |
| |
| return subdir name if and only if there exists one, otherwise raise PathException |
| """ |
| args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions') |
| subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22)) |
| dirs = os.listdir(into) |
| if len(dirs) == 1: |
| return dirs[0] |
| else: |
| raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs)) |
| |
| @staticmethod |
| def tar(path, subdir, into=None, ts=None): |
| """Pack ``path`` into tarball ``into``.""" |
| # --sort=name requires a recent build of GNU tar |
| args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name', '--mode=a-s'] |
| args += ['-C', path, '-cf', into, subdir] |
| envs = os.environ.copy() |
| if ts is not None: |
| args.append('--mtime=@%d' % ts) |
| if into.endswith('.zst'): |
| args.append('-I zstd -T0 --ultra -20') |
| elif into.endswith('.xz'): |
| envs['XZ_OPT'] = '-7e' |
| args.append('-J') |
| elif into.endswith('.bz2'): |
| args.append('-j') |
| elif into.endswith('.gz'): |
| args.append('-z') |
| envs['GZIP'] = '-n' |
| else: |
| raise PathException('unknown compression type %s' % into) |
| subprocess.check_call(args, env=envs) |
| |
| |
| class GitHubCommitTsCache(object): |
| __cachef = 'github.commit.ts.cache' |
| __cachen = 2048 |
| |
| def __init__(self): |
| Path.mkdir_all(TMPDIR_DL) |
| self.cachef = os.path.join(TMPDIR_DL, self.__cachef) |
| self.cache = {} |
| |
| def get(self, k): |
| """Get timestamp with key ``k``.""" |
| fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT) |
| with os.fdopen(fileno) as fin: |
| try: |
| fcntl.lockf(fileno, fcntl.LOCK_SH) |
| self._cache_init(fin) |
| if k in self.cache: |
| ts = self.cache[k][0] |
| return ts |
| finally: |
| fcntl.lockf(fileno, fcntl.LOCK_UN) |
| return None |
| |
| def set(self, k, v): |
| """Update timestamp with ``k``.""" |
| fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT) |
| with os.fdopen(fileno, 'w+') as f: |
| try: |
| fcntl.lockf(fileno, fcntl.LOCK_EX) |
| self._cache_init(f) |
| self.cache[k] = (v, int(time.time())) |
| self._cache_flush(f) |
| finally: |
| fcntl.lockf(fileno, fcntl.LOCK_UN) |
| |
| def _cache_init(self, fin): |
| for line in fin: |
| k, ts, updated = line.split() |
| ts = int(ts) |
| updated = int(updated) |
| self.cache[k] = (ts, updated) |
| |
| def _cache_flush(self, fout): |
| cache = sorted(self.cache.items(), key=lambda a: a[1][1]) |
| cache = cache[:self.__cachen] |
| self.cache = {} |
| os.ftruncate(fout.fileno(), 0) |
| fout.seek(0, os.SEEK_SET) |
| for k, ent in cache: |
| ts = ent[0] |
| updated = ent[1] |
| line = '{0} {1} {2}\n'.format(k, ts, updated) |
| fout.write(line) |
| |
| |
| class DownloadGitHubTarball(object): |
| """Download and repack archive tarball from GitHub. |
| |
| Compared with the method of packing after cloning the whole repo, this |
| method is more friendly to users with fragile internet connection. |
| |
| However, there are limitations with this method |
| |
| - GitHub imposes a 60 reqs/hour limit for unauthenticated API access. |
| This affects fetching commit date for reproducible tarballs. Download |
| through the archive link is not affected. |
| |
| - GitHub archives do not contain source codes for submodules. |
| |
| - GitHub archives seem to respect .gitattributes and ignore paths with |
| export-ignore attributes. |
| |
| For the first two issues, the method will fail loudly to allow fallback to |
| clone-then-pack method. |
| |
| As for the 3rd issue, to make sure that this method only produces identical |
| tarballs as the fallback method, we require the expected hash value to be |
| supplied. That means the first tarball will need to be prepared by the |
| clone-then-pack method |
| """ |
| |
| __repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)') |
| |
| def __init__(self, args): |
| self.dl_dir = args.dl_dir |
| self.version = args.version |
| self.subdir = args.subdir |
| self.source = args.source |
| self.submodules = args.submodules |
| self.url = args.url |
| self._init_owner_repo() |
| self.xhash = args.hash |
| self._init_hasher() |
| self.commit_ts = None # lazy load commit timestamp |
| self.commit_ts_cache = GitHubCommitTsCache() |
| self.name = 'github-tarball' |
| |
| def download(self): |
| """Download and repack GitHub archive tarball.""" |
| if self.submodules and self.submodules != ['skip']: |
| raise self._error('Fetching submodules is not yet supported') |
| self._init_commit_ts() |
| with Path(TMPDIR_DL, keep=True) as dir_dl: |
| # fetch tarball from GitHub |
| tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl') |
| with Path(tarball_path, isdir=False): |
| self._fetch(tarball_path) |
| # unpack |
| d = os.path.join(dir_dl.path, self.subdir + '.untar') |
| with Path(d, preclean=True) as dir_untar: |
| tarball_prefix = Path.untar(tarball_path, into=dir_untar.path) |
| dir0 = os.path.join(dir_untar.path, tarball_prefix) |
| dir1 = os.path.join(dir_untar.path, self.subdir) |
| # submodules check |
| if self.submodules != ['skip'] and self._has_submodule(dir0): |
| raise self._error('Fetching submodules is not yet supported') |
| # rename subdir |
| os.rename(dir0, dir1) |
| # repack |
| into=os.path.join(TMPDIR_DL, self.source) |
| Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts) |
| try: |
| self._hash_check(into) |
| except Exception: |
| Path.rm_all(into) |
| raise |
| # move to target location |
| file1 = os.path.join(self.dl_dir, self.source) |
| if into != file1: |
| shutil.move(into, file1) |
| |
| def _has_submodule(self, dir_): |
| m = os.path.join(dir_, '.gitmodules') |
| try: |
| st = os.stat(m) |
| return st.st_size > 0 |
| except OSError as e: |
| return e.errno != errno.ENOENT |
| |
| def _init_owner_repo(self): |
| m = self.__repo_url_regex.search(self.url) |
| if m is None: |
| raise self._error('Invalid github url: {}'.format(self.url)) |
| owner = m.group('owner') |
| repo = m.group('repo') |
| if repo.endswith('.git'): |
| repo = repo[:-4] |
| self.owner = owner |
| self.repo = repo |
| |
| def _init_hasher(self): |
| xhash = self.xhash |
| if len(xhash) == 64: |
| self.hasher = hashlib.sha256() |
| elif len(xhash) == 32: |
| self.hasher = hashlib.md5() |
| else: |
| raise self._error('Requires sha256sum for verification') |
| self.xhash = xhash |
| |
| def _hash_check(self, f): |
| with open(f, 'rb') as fin: |
| while True: |
| d = fin.read(4096) |
| if not d: |
| break |
| self.hasher.update(d) |
| xhash = self.hasher.hexdigest() |
| if xhash != self.xhash: |
| raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash)) |
| |
| def _init_commit_ts(self): |
| if self.commit_ts is not None: |
| return |
| # GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is more |
| # terse while API[2] provides more verbose info such as commit diff |
| # etc. That's the main reason why API[1] is preferred: the response |
| # size is predictable. |
| # |
| # However, API[1] only accepts complete commit sha1sum as the parameter |
| # while API[2] is more liberal accepting also partial commit id and |
| # tags, etc. |
| # |
| # [1] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit |
| # [2] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit |
| apis = [ |
| { |
| 'url': self._make_repo_url_path('git', 'commits', self.version), |
| 'attr_path': ('committer', 'date'), |
| }, { |
| 'url': self._make_repo_url_path('commits', self.version), |
| 'attr_path': ('commit', 'committer', 'date'), |
| }, |
| ] |
| version_is_sha1sum = len(self.version) == 40 |
| if not version_is_sha1sum: |
| apis.insert(0, apis.pop()) |
| reasons = '' |
| for api in apis: |
| url = api['url'] |
| attr_path = api['attr_path'] |
| try: |
| ct = self.commit_ts_cache.get(url) |
| if ct is not None: |
| self.commit_ts = ct |
| return |
| ct = self._init_commit_ts_remote_get(url, attr_path) |
| self.commit_ts = ct |
| self.commit_ts_cache.set(url, ct) |
| return |
| except Exception as e: |
| reasons += '\n' + (" {}: {}".format(url, e)) |
| raise self._error('Cannot fetch commit ts:{}'.format(reasons)) |
| |
| def _init_commit_ts_remote_get(self, url, attrpath): |
| resp = self._make_request(url) |
| data = resp.read() |
| date = json.loads(data) |
| for attr in attrpath: |
| date = date[attr] |
| date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ') |
| date = date.timetuple() |
| ct = calendar.timegm(date) |
| return ct |
| |
| def _fetch(self, path): |
| """Fetch tarball of the specified version ref.""" |
| ref = self.version |
| url = self._make_repo_url_path('tarball', ref) |
| resp = self._make_request(url) |
| with open(path, 'wb') as fout: |
| while True: |
| d = resp.read(4096) |
| if not d: |
| break |
| fout.write(d) |
| |
| def _make_repo_url_path(self, *args): |
| url = '/repos/{0}/{1}'.format(self.owner, self.repo) |
| if args: |
| url += '/' + '/'.join(args) |
| return url |
| |
| def _make_request(self, path): |
| """Request GitHub API endpoint on ``path``.""" |
| url = 'https://api.github.com' + path |
| headers = { |
| 'Accept': 'application/vnd.github.v3+json', |
| 'User-Agent': 'OpenWrt', |
| } |
| req = urllib.request.Request(url, headers=headers) |
| sslcontext = ssl._create_unverified_context() |
| fileobj = urllib.request.urlopen(req, context=sslcontext) |
| return fileobj |
| |
| def _error(self, msg): |
| return DownloadGitHubError('{}: {}'.format(self.source, msg)) |
| |
| |
| def main(): |
| parser = argparse.ArgumentParser() |
| parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir') |
| parser.add_argument('--url', help='Download URL') |
| parser.add_argument('--subdir', help='Source code subdir name') |
| parser.add_argument('--version', help='Source code version') |
| parser.add_argument('--source', help='Source tarball filename') |
| parser.add_argument('--hash', help='Source tarball\'s expected sha256sum') |
| parser.add_argument('--submodules', nargs='*', help='List of submodules, or "skip"') |
| args = parser.parse_args() |
| try: |
| method = DownloadGitHubTarball(args) |
| method.download() |
| except Exception as ex: |
| sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url)) |
| sys.stderr.write('{}\n'.format(ex)) |
| sys.exit(1) |
| |
| if __name__ == '__main__': |
| main() |