Factor out file search.

This commit is contained in:
Jelmer Vernooij 2021-03-16 15:26:52 +00:00
parent ceb052332b
commit 6aaa2df421
3 changed files with 299 additions and 233 deletions

View file

@ -20,6 +20,9 @@ import os
import stat
USER_AGENT = "Ognibuild"
class DetailedFailure(Exception):
def __init__(self, retcode, argv, error):
self.retcode = retcode

View file

@ -18,7 +18,7 @@
import logging
import re
from typing import List, Iterator, Optional, Set
from typing import List, Optional, Set
import os
from buildlog_consultant.apt import (
@ -27,9 +27,7 @@ from buildlog_consultant.apt import (
from .. import DetailedFailure, UnidentifiedError
from ..session import Session, run_with_tee
USER_AGENT = "Ognibuild"
from .file_search import FileSearcher, AptCachedContentsFileSearcher, GENERATED_FILE_SEARCHER, get_package_for_paths
def run_apt(session: Session, args: List[str]) -> None:
@ -46,11 +44,6 @@ def run_apt(session: Session, args: List[str]) -> None:
raise UnidentifiedError(retcode, args, lines, secondary=match)
class FileSearcher(object):
    """Interface for objects that can look up which package ships a file."""

    def search_files(self, path: str, regex: bool = False) -> Iterator[str]:
        """Yield the packages matching ``path``.

        Subclasses must override this; the base implementation always
        raises ``NotImplementedError``.

        :param path: path (or regular expression, when ``regex`` is set)
        :param regex: whether ``path`` is a regular expression
        """
        raise NotImplementedError(self.search_files)
class AptManager(object):
session: Session
@ -64,7 +57,7 @@ class AptManager(object):
def searchers(self):
if self._searchers is None:
self._searchers = [
AptContentsFileSearcher.from_session(self.session),
AptCachedContentsFileSearcher.from_session(self.session),
GENERATED_FILE_SEARCHER,
]
return self._searchers
@ -108,226 +101,3 @@ class AptManager(object):
def satisfy_command(self, deps: List[str]) -> List[str]:
return ["apt", "satisfy"] + deps
class ContentsFileNotFound(Exception):
    """Signals that a Contents file could not be located."""
def read_contents_file(f):
    """Parse the lines of an apt Contents file.

    Each usable line consists of a path and a location column separated
    by whitespace; the path itself may contain spaces, so the split is
    done from the right.

    :param f: iterable of ``bytes`` lines
    :return: iterator over ``(path, package)`` string tuples, where the
        path is prefixed with ``/`` and the package is the last
        ``/``-separated component of the location column
    """
    for line in f:
        fields = line.rsplit(maxsplit=1)
        if len(fields) != 2:
            # Blank lines and lines with a single column carry no
            # path/package pair; skip them instead of crashing on unpack.
            continue
        path, rest = fields
        package = rest.split(b"/")[-1]
        decoded_path = "/" + path.decode("utf-8", "surrogateescape")
        yield decoded_path, package.decode("utf-8")
def url_to_cache_filename(url):
    """Flatten ``url`` into a single file name.

    The result is the URL's host name followed by its path with every
    ``/`` replaced by ``_``.
    """
    from urllib.parse import urlparse

    parts = urlparse(url)
    flattened_path = "_".join(parts.path.split("/"))
    return parts.hostname + flattened_path
def contents_urls_from_sourceslist(sl, arch):
    """Yield ``(url, mandatory)`` pairs for the Contents files of ``sl``.

    For every usable ``deb`` entry, one URL is produced for ``arch``
    (mandatory) and one for ``all`` (optional), per component if the entry
    has components. Invalid, disabled and ``deb-src`` entries are skipped;
    entries of any other type are skipped with a warning.

    :param sl: object whose ``list`` attribute holds the source entries
    :param arch: architecture name used in the ``Contents-<arch>`` suffix
    """
    # TODO(jelmer): Verify signatures, etc.
    wanted = ((arch, True), ("all", False))
    for entry in sl.list:
        if entry.invalid or entry.disabled:
            continue
        if entry.type == "deb-src":
            continue
        if entry.type != "deb":
            logging.warning("Invalid line in sources: %r", entry)
            continue
        root = entry.uri.rstrip("/")
        suite = entry.dist.rstrip("/")
        comps = entry.comps
        if comps:
            # Entries with components live below <uri>/dists/<suite>/<comp>.
            prefixes = ["%s/dists/%s/%s" % (root, suite, comp) for comp in comps]
        else:
            prefixes = ["%s/%s" % (root, suite)]
        for prefix in prefixes:
            for contents_arch, mandatory in wanted:
                yield ("%s/Contents-%s" % (prefix, contents_arch), mandatory)
def load_contents_url(url):
    """Open the Contents file at ``url`` and return a readable binary file.

    The ``.xz`` and ``.gz`` variants are tried before the bare URL; a 404
    moves on to the next variant, any other HTTP error propagates.

    :param url: URL of the Contents file, without compression suffix
    :return: file-like object yielding the decompressed contents
    :raises ContentsFileNotFound: if every variant returned 404
    :raises Exception: if the uncompressed variant is not ``text/plain``
    """
    from urllib.error import HTTPError
    from urllib.request import urlopen, Request

    for ext in [".xz", ".gz", ""]:
        try:
            request = Request(url + ext, headers={"User-Agent": USER_AGENT})
            response = urlopen(request)
        except HTTPError as e:
            # ``code`` is the attribute documented for HTTPError.
            if e.code == 404:
                continue
            raise
        break
    else:
        raise ContentsFileNotFound(url)
    if ext == ".gz":
        import gzip

        return gzip.GzipFile(fileobj=response)
    if ext == ".xz":
        import lzma
        from io import BytesIO

        # The whole body is read here, so the response can be released.
        try:
            return BytesIO(lzma.decompress(response.read()))
        finally:
            response.close()
    content_type = response.headers.get_content_type()
    if content_type == "text/plain":
        return response
    response.close()
    raise Exception("Unknown content type %r" % content_type)
class AptContentsFileSearcher(FileSearcher):
    """File searcher backed by an in-memory index of apt Contents files.

    ``_db`` maps each path yielded by ``read_contents_file`` to a single
    package name; a later entry for the same path overwrites an earlier one.
    """

    def __init__(self):
        self._db = {}

    @classmethod
    def from_session(cls, session):
        """Build a searcher from the ``sources.list`` inside ``session``.

        Cached Contents files are looked up under the session's
        ``var/lib/apt/lists`` first, then under ``/var/lib/apt/lists``.
        """
        logging.info("Loading apt contents information")
        # TODO(jelmer): what about sources.list.d?
        from aptsources.sourceslist import SourcesList

        sl = SourcesList()
        sl.load(os.path.join(session.location, "etc/apt/sources.list"))
        return cls.from_sources_list(
            sl,
            cache_dirs=[
                os.path.join(session.location, "var/lib/apt/lists"),
                "/var/lib/apt/lists",
            ],
        )

    def __setitem__(self, path, package):
        self._db[path] = package

    def search_files(self, path, regex=False):
        """Yield the package(s) providing ``path``.

        :param path: exact path, or a regular expression when ``regex``
        :param regex: match ``path`` against the start of every known path
        """
        if regex:
            c = re.compile(path)
            for p, pkg in sorted(self._db.items()):
                if c.match(p):
                    yield pkg
        elif path in self._db:
            # This is a generator, so the hit must be yielded; a plain
            # ``return value`` would make exact lookups produce nothing.
            yield self._db[path]

    def load_file(self, f):
        """Add every entry of the Contents file ``f`` to the index."""
        for path, package in read_contents_file(f):
            self[path] = package

    @classmethod
    def _load_cache_file(cls, url, cache_dir):
        """Open the lz4-compressed cache file for ``url``, or return None."""
        fn = url_to_cache_filename(url)
        p = os.path.join(cache_dir, fn + ".lz4")
        if not os.path.exists(p):
            return None
        logging.debug("Loading cached contents file %s", p)
        import lz4.frame

        return lz4.frame.open(p, mode="rb")

    @classmethod
    def from_urls(cls, urls, cache_dirs=None):
        """Build a searcher from ``(url, mandatory)`` pairs.

        Each URL is served from the first cache directory that has it.
        Otherwise it is fetched, unless it is optional and the index
        already has entries.
        """
        self = cls()
        for url, mandatory in urls:
            for cache_dir in cache_dirs or []:
                f = cls._load_cache_file(url, cache_dir)
                if f is not None:
                    try:
                        self.load_file(f)
                    finally:
                        f.close()
                    break
            else:
                if not mandatory and self._db:
                    logging.debug(
                        "Not attempting to fetch optional contents file %s", url
                    )
                    continue
                logging.debug("Fetching contents file %s", url)
                try:
                    self.load_url(url)
                except ContentsFileNotFound:
                    if mandatory:
                        logging.warning("Unable to fetch contents file %s", url)
                    else:
                        logging.debug(
                            "Unable to fetch optional contents file %s", url
                        )
        return self

    @classmethod
    def from_sources_list(cls, sl, cache_dirs=None):
        """Build a searcher for the build architecture's Contents files."""
        # TODO(jelmer): Use aptsources.sourceslist.SourcesList
        from .build import get_build_architecture

        urls = list(
            contents_urls_from_sourceslist(sl, get_build_architecture()))
        return cls.from_urls(urls, cache_dirs=cache_dirs)

    def load_url(self, url, allow_cache=True):
        """Fetch the Contents file at ``url`` and add it to the index.

        ``allow_cache`` is accepted but currently unused.
        """
        f = load_contents_url(url)
        try:
            self.load_file(f)
        finally:
            f.close()
class GeneratedFileSearcher(FileSearcher):
    """File searcher that answers from a fixed path-to-package mapping."""

    def __init__(self, db):
        self._db = db

    def search_files(self, path: str, regex: bool = False) -> Iterator[str]:
        """Yield the value of every entry matching ``path``.

        Entries are visited in sorted order. With ``regex`` set, ``path`` is
        matched against the start of each known path; otherwise it must be
        equal to it.
        """
        for known_path, package in sorted(self._db.items()):
            matched = re.match(path, known_path) if regex else path == known_path
            if matched:
                yield package
# TODO(jelmer): read from a file
# Shared searcher over a small hard-coded path table.
# NOTE(review): presumably these are files that Contents indices do not
# list (e.g. generated at install time) -- confirm.
GENERATED_FILE_SEARCHER = GeneratedFileSearcher(
    {
        "/etc/locale.gen": "locales",
        # Alternative
        # NOTE(review): this value is a path rather than a package name;
        # confirm callers expect that.
        "/usr/bin/rst2html": "/usr/share/docutils/scripts/python3/rst2html",
    }
)
def get_package_for_paths(
    paths: List[str], searchers: List["FileSearcher"], regex: bool = False
) -> Optional[str]:
    """Return the package providing the first of ``paths`` that is found.

    Paths are tried in order; for each one every searcher is consulted and
    the search stops at the first path with at least one candidate.

    :param paths: paths (or regular expressions when ``regex`` is set)
    :param searchers: objects with a ``search_files(path, regex=...)`` method
    :param regex: treat each path as a regular expression
    :return: the chosen package name, or None if nothing was found
    """
    candidates: Set[str] = set()
    for path in paths:
        for searcher in searchers:
            candidates.update(searcher.search_files(path, regex=regex))
        if candidates:
            break
    if not candidates:
        logging.debug("No packages found that contain %r", paths)
        return None
    if len(candidates) > 1:
        logging.warning(
            "More than 1 packages found that contain %r: %r", path, candidates
        )
        # Euhr. Pick the one with the shortest name? Ties are broken
        # alphabetically so the answer does not depend on set ordering.
        return min(candidates, key=lambda name: (len(name), name))
    return candidates.pop()

View file

@ -0,0 +1,293 @@
#!/usr/bin/python
# Copyright (C) 2019-2020 Jelmer Vernooij <jelmer@jelmer.uk>
# encoding: utf-8
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
from datetime import datetime
import os
import re
from typing import Iterator, List, Optional, Set
import logging
from .. import USER_AGENT
class FileSearcher(object):
    """Interface for objects that can look up which package ships a file."""

    def search_files(self, path: str, regex: bool = False) -> Iterator[str]:
        """Yield the packages matching ``path``.

        Subclasses must override this; the base implementation always
        raises ``NotImplementedError``.

        :param path: path (or regular expression, when ``regex`` is set)
        :param regex: whether ``path`` is a regular expression
        """
        raise NotImplementedError(self.search_files)
class ContentsFileNotFound(Exception):
    """Signals that a Contents file could not be located."""
def read_contents_file(f):
    """Split the lines of an apt Contents file into their two columns.

    The path column may itself contain spaces, so each line is split once
    from the right. Nothing is decoded: both columns stay ``bytes``.

    :param f: iterable of ``bytes`` lines
    :return: iterator over ``(path, rest)`` tuples, ``rest`` being the raw
        last whitespace-separated column
    """
    for line in f:
        fields = line.rsplit(maxsplit=1)
        if len(fields) != 2:
            # Blank lines and lines with a single column carry no
            # path/location pair; skip them instead of crashing on unpack.
            continue
        path, rest = fields
        yield path, rest
def url_to_cache_filename(url):
    """Flatten ``url`` into a single file name.

    The result is the URL's host name followed by its path with every
    ``/`` replaced by ``_``.
    """
    from urllib.parse import urlparse

    parts = urlparse(url)
    flattened_path = "_".join(parts.path.split("/"))
    return parts.hostname + flattened_path
def contents_urls_from_sourceslist(sl, arch):
    """Yield ``(url, mandatory)`` pairs for the Contents files of ``sl``.

    For every usable ``deb`` entry, one URL is produced for ``arch``
    (mandatory) and one for ``all`` (optional), per component if the entry
    has components. Invalid, disabled and ``deb-src`` entries are skipped;
    entries of any other type are skipped with a warning.

    :param sl: object whose ``list`` attribute holds the source entries
    :param arch: architecture name used in the ``Contents-<arch>`` suffix
    """
    # TODO(jelmer): Verify signatures, etc.
    wanted = ((arch, True), ("all", False))
    for entry in sl.list:
        if entry.invalid or entry.disabled:
            continue
        if entry.type == "deb-src":
            continue
        if entry.type != "deb":
            logging.warning("Invalid line in sources: %r", entry)
            continue
        root = entry.uri.rstrip("/")
        suite = entry.dist.rstrip("/")
        comps = entry.comps
        if comps:
            # Entries with components live below <uri>/dists/<suite>/<comp>.
            prefixes = ["%s/dists/%s/%s" % (root, suite, comp) for comp in comps]
        else:
            prefixes = ["%s/%s" % (root, suite)]
        for prefix in prefixes:
            for contents_arch, mandatory in wanted:
                yield ("%s/Contents-%s" % (prefix, contents_arch), mandatory)
def load_contents_url(url):
    """Open the Contents file at ``url`` and return a readable binary file.

    The ``.xz`` and ``.gz`` variants are tried before the bare URL; a 404
    moves on to the next variant, any other HTTP error propagates.

    :param url: URL of the Contents file, without compression suffix
    :return: file-like object yielding the decompressed contents
    :raises ContentsFileNotFound: if every variant returned 404
    :raises Exception: if the uncompressed variant is not ``text/plain``
    """
    from urllib.error import HTTPError
    from urllib.request import urlopen, Request

    for ext in [".xz", ".gz", ""]:
        try:
            request = Request(url + ext, headers={"User-Agent": USER_AGENT})
            response = urlopen(request)
        except HTTPError as e:
            # ``code`` is the attribute documented for HTTPError.
            if e.code == 404:
                continue
            raise
        break
    else:
        raise ContentsFileNotFound(url)
    if ext == ".gz":
        import gzip

        return gzip.GzipFile(fileobj=response)
    if ext == ".xz":
        import lzma
        from io import BytesIO

        # The whole body is read here, so the response can be released.
        try:
            return BytesIO(lzma.decompress(response.read()))
        finally:
            response.close()
    content_type = response.headers.get_content_type()
    if content_type == "text/plain":
        return response
    response.close()
    raise Exception("Unknown content type %r" % content_type)
def load_apt_cache_file(cache_dir, url):
    """Open the lz4-compressed cached Contents file for ``url``.

    :param cache_dir: directory expected to hold the cache files
    :param url: Contents URL; its flattened form plus ``.lz4`` is the
        file name looked for
    :return: binary file object, or None if no such cache file exists
    """
    cache_path = os.path.join(cache_dir, url_to_cache_filename(url) + ".lz4")
    if os.path.exists(cache_path):
        logging.debug("Loading cached contents file %s", cache_path)
        import lz4.frame

        return lz4.frame.open(cache_path, mode="rb")
    return None
class AptCachedContentsFileSearcher(FileSearcher):
    """File searcher backed by an in-memory index of apt Contents files.

    ``_db`` maps the raw ``bytes`` path column (no leading ``/``) to the raw
    ``bytes`` location column; the package name is only extracted and
    decoded when a lookup hits.
    """

    def __init__(self):
        self._db = {}

    @classmethod
    def from_session(cls, session):
        """Return a searcher loaded from the apt configuration of ``session``."""
        logging.info("Loading apt contents information")
        self = cls()
        self.load_from_session(session)
        return self

    def load_local(self):
        """Load Contents files for the host's ``/etc/apt/sources.list``."""
        self._load_sources_list(
            "/etc/apt/sources.list", ["/var/lib/apt/lists"])

    def load_from_session(self, session):
        """Load Contents files for the ``sources.list`` inside ``session``.

        Cached files are looked up under the session's
        ``var/lib/apt/lists`` first, then under ``/var/lib/apt/lists``.
        """
        self._load_sources_list(
            os.path.join(session.location, "etc/apt/sources.list"),
            [
                os.path.join(session.location, "var/lib/apt/lists"),
                "/var/lib/apt/lists",
            ],
        )

    def _load_sources_list(self, sources_list_path, cache_dirs):
        """Index the Contents files referenced by one ``sources.list``."""
        # TODO(jelmer): what about sources.list.d?
        from aptsources.sourceslist import SourcesList

        sl = SourcesList()
        sl.load(sources_list_path)
        from .build import get_build_architecture

        urls = list(
            contents_urls_from_sourceslist(sl, get_build_architecture()))
        self._load_urls(urls, cache_dirs)

    def _load_file_and_close(self, f, url):
        """Index ``f`` and close it whether or not loading succeeded."""
        try:
            self.load_file(f, url)
        finally:
            f.close()

    def _load_urls(self, urls, cache_dirs):
        """Index every ``(url, mandatory)`` pair in ``urls``.

        Each URL is served from the first cache directory that has it.
        Otherwise it is fetched, unless it is optional and the index
        already has entries.
        """
        for url, mandatory in urls:
            for cache_dir in cache_dirs:
                f = load_apt_cache_file(cache_dir, url)
                if f is not None:
                    self._load_file_and_close(f, url)
                    break
            else:
                if not mandatory and self._db:
                    logging.debug(
                        "Not attempting to fetch optional contents file %s", url
                    )
                    continue
                logging.debug("Fetching contents file %s", url)
                try:
                    f = load_contents_url(url)
                except ContentsFileNotFound:
                    if mandatory:
                        logging.warning("Unable to fetch contents file %s", url)
                    else:
                        logging.debug(
                            "Unable to fetch optional contents file %s", url
                        )
                else:
                    self._load_file_and_close(f, url)

    def __setitem__(self, path, package):
        self._db[path] = package

    def search_files(self, path, regex=False):
        """Yield the name of each package that ships ``path``.

        :param path: exact path, or a regular expression when ``regex``;
            leading slashes are stripped before the lookup
        :param regex: match against the start of every indexed path and
            yield the hits ordered by path
        """
        path = path.lstrip('/').encode('utf-8', 'surrogateescape')
        if regex:
            c = re.compile(path)
            ret = []
            for p, rest in self._db.items():
                if c.match(p):
                    pkg = rest.split(b"/")[-1]
                    ret.append((p, pkg.decode('utf-8')))
            for p, pkg in sorted(ret):
                yield pkg
        else:
            rest = self._db.get(path)
            if rest is not None:
                yield rest.split(b"/")[-1].decode('utf-8')

    def load_file(self, f, url):
        """Add every entry of the Contents file ``f`` to the index.

        :param f: binary file object; the caller remains responsible for
            closing it
        :param url: only used to label the timing debug message
        """
        start_time = datetime.now()
        # NOTE(review): readlines() holds the whole file in memory at once;
        # presumably deliberate for read speed -- confirm before streaming.
        for path, rest in read_contents_file(f.readlines()):
            self[path] = rest
        logging.debug('Read %s in %s', url, datetime.now() - start_time)
class GeneratedFileSearcher(FileSearcher):
    """File searcher that answers from a fixed path-to-package mapping."""

    def __init__(self, db):
        self._db = db

    def search_files(self, path: str, regex: bool = False) -> Iterator[str]:
        """Yield the value of every entry matching ``path``.

        Entries are visited in sorted order. With ``regex`` set, ``path`` is
        matched against the start of each known path; otherwise it must be
        equal to it.
        """
        for known_path, package in sorted(self._db.items()):
            matched = re.match(path, known_path) if regex else path == known_path
            if matched:
                yield package
# TODO(jelmer): read from a file
# Shared searcher over a small hard-coded path table.
# NOTE(review): presumably these are files that Contents indices do not
# list (e.g. generated at install time) -- confirm.
GENERATED_FILE_SEARCHER = GeneratedFileSearcher(
    {
        "/etc/locale.gen": "locales",
        # Alternative
        # NOTE(review): this value is a path rather than a package name;
        # confirm callers expect that.
        "/usr/bin/rst2html": "/usr/share/docutils/scripts/python3/rst2html",
    }
)
def get_package_for_paths(
    paths: List[str], searchers: List["FileSearcher"], regex: bool = False
) -> Optional[str]:
    """Return the package providing the first of ``paths`` that is found.

    Paths are tried in order; for each one every searcher is consulted and
    the search stops at the first path with at least one candidate.

    :param paths: paths (or regular expressions when ``regex`` is set)
    :param searchers: objects with a ``search_files(path, regex=...)`` method
    :param regex: treat each path as a regular expression
    :return: the chosen package name, or None if nothing was found
    """
    candidates: Set[str] = set()
    for path in paths:
        for searcher in searchers:
            candidates.update(searcher.search_files(path, regex=regex))
        if candidates:
            break
    if not candidates:
        logging.debug("No packages found that contain %r", paths)
        return None
    if len(candidates) > 1:
        logging.warning(
            "More than 1 packages found that contain %r: %r", path, candidates
        )
        # Euhr. Pick the one with the shortest name? Ties are broken
        # alphabetically so the answer does not depend on set ordering.
        return min(candidates, key=lambda name: (len(name), name))
    return candidates.pop()
def main(argv):
    """Command-line entry point: print the package providing the given paths.

    :param argv: full argument vector, program name first; only the
        elements after the first are parsed
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('path', help='Path to search for.', type=str, nargs='*')
    parser.add_argument('--regex', '-x', help='Search for regex.', action='store_true')
    parser.add_argument('--debug', action='store_true')
    # Parse the arguments that were passed in rather than implicitly
    # falling back to sys.argv, so the ``argv`` parameter is honoured.
    args = parser.parse_args(argv[1:])

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    main_searcher = AptCachedContentsFileSearcher()
    main_searcher.load_local()
    searchers = [main_searcher, GENERATED_FILE_SEARCHER]

    package = get_package_for_paths(args.path, searchers=searchers, regex=args.regex)
    print(package)
# Script entry point: hand the process arguments to main() and exit with
# whatever it returns.
if __name__ == '__main__':
    import sys

    raise SystemExit(main(sys.argv))