]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/data/buildtool/filtration.py
ICU-64232.0.1.tar.gz
[apple/icu.git] / icuSources / data / buildtool / filtration.py
diff --git a/icuSources/data/buildtool/filtration.py b/icuSources/data/buildtool/filtration.py
new file mode 100644 (file)
index 0000000..48d78e3
--- /dev/null
@@ -0,0 +1,385 @@
+# Copyright (C) 2018 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+
+# Python 2/3 Compatibility (ICU-20299)
+# TODO(ICU-20301): Remove this.
+from __future__ import print_function
+
+from abc import abstractmethod
+from collections import defaultdict
+import re
+import sys
+
+from . import *
+from . import utils
+from .locale_dependencies import data as DEPENDENCY_DATA
+from .request_types import *
+
+
+# Note: for this to be a proper abstract class, it should extend abc.ABC.
+# There is no nice way to do this that works in both Python 2 and 3.
+# TODO(ICU-20301): Make this inherit from abc.ABC.
class Filter(object):
    """Base class for all file filters.

    Subclasses implement match() to decide whether one input file is kept.
    Note: for this to be a proper abstract class, it should extend abc.ABC.
    There is no nice way to do this that works in both Python 2 and 3.
    TODO(ICU-20301): Make this inherit from abc.ABC.
    """

    @staticmethod
    def create_from_json(json_data):
        """Builds the Filter subclass named by json_data["filterType"].

        Defaults to "file-stem" when no filterType is given; returns None
        (after printing to stderr) for an unrecognized filterType.
        """
        filter_type = json_data.get("filterType", "file-stem")
        # "exclude" is the only type whose constructor takes no arguments.
        if filter_type == "exclude":
            return ExclusionFilter()
        constructors = {
            "file-stem": FileStemFilter,
            "language": LanguageFilter,
            "regex": RegexFilter,
            "union": UnionFilter,
            "locale": LocaleFilter,
        }
        if filter_type in constructors:
            return constructors[filter_type](json_data)
        print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
        return None

    def filter(self, request):
        """Applies this filter to *request*; returns [] or [request]."""
        if not request.apply_file_filter(self):
            return []
        # Sanity check: every file the request still reads must match.
        for remaining in request.all_input_files():
            assert self.match(remaining)
        return [request]

    @classmethod
    def _file_to_file_stem(cls, file):
        """Returns the basename of *file* with its extension removed."""
        slash = file.filename.rfind("/")
        dot = file.filename.rfind(".")
        return file.filename[slash + 1:dot]

    @abstractmethod
    def match(self, file):
        """Returns truthy if *file* passes this filter."""
        pass
+
+
class InclusionFilter(Filter):
    """Filter that accepts every file."""

    def match(self, file):
        return True
+
+
class ExclusionFilter(Filter):
    """Filter that rejects every file."""

    def match(self, file):
        return False
+
+
class WhitelistBlacklistFilter(Filter):
    """Filter configured with either a "whitelist" or a "blacklist" key.

    Exactly one of self.whitelist / self.blacklist is set, and
    self.is_whitelist records which mode is active.
    """

    def __init__(self, json_data):
        self.is_whitelist = "whitelist" in json_data
        if self.is_whitelist:
            self.whitelist = json_data["whitelist"]
        else:
            assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data)
            self.blacklist = json_data["blacklist"]

    def match(self, file):
        # Delegate to the subclass, passing the file stem only.
        return self._should_include(self._file_to_file_stem(file))

    @abstractmethod
    def _should_include(self, file_stem):
        """Subclass hook: decide inclusion from the file stem alone."""
        pass
+
+
class FileStemFilter(WhitelistBlacklistFilter):
    """Filters by exact file stem (e.g. "en_US")."""

    def _should_include(self, file_stem):
        if self.is_whitelist:
            return file_stem in self.whitelist
        return file_stem not in self.blacklist
+
+
class LanguageFilter(WhitelistBlacklistFilter):
    """Filters by the language subtag (text before the first underscore)."""

    def _should_include(self, file_stem):
        language = file_stem.partition("_")[0]
        if language == "root":
            # Always include root.txt
            return True
        if self.is_whitelist:
            return language in self.whitelist
        return language not in self.blacklist
+
+
class RegexFilter(WhitelistBlacklistFilter):
    """Filters file stems against a list of regular expressions.

    Patterns are applied with re.match, i.e. anchored at the start of the
    file stem.
    """

    def __init__(self, *args):
        # TODO(ICU-20301): Change this to: super().__init__(*args)
        super(RegexFilter, self).__init__(*args)
        # Compile every pattern once up front; match() runs per input file.
        if self.is_whitelist:
            self.whitelist = [re.compile(pattern) for pattern in self.whitelist]
        else:
            self.blacklist = [re.compile(pattern) for pattern in self.blacklist]

    def _should_include(self, file_stem):
        if self.is_whitelist:
            return any(pattern.match(file_stem) for pattern in self.whitelist)
        return not any(pattern.match(file_stem) for pattern in self.blacklist)
+
+
class UnionFilter(Filter):
    """Filter that matches iff at least one of its sub-filters matches."""

    def __init__(self, json_data):
        # Build one Filter per entry of "unionOf".
        self.sub_filters = [
            Filter.create_from_json(sub_json)
            for sub_json in json_data["unionOf"]
        ]

    def match(self, file):
        """Match iff any of the sub-filters match."""
        return any(sub_filter.match(file) for sub_filter in self.sub_filters)
+
+
# Matches stems like "sr_Latn": a 2-3 letter lowercase language subtag plus
# a 4-letter titlecase script subtag; group(1) captures the language.
LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
# Matches a bare 2-3 letter lowercase language subtag with no other subtags.
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")
+
class LocaleFilter(Filter):
    """Whitelist filter over locale file stems with locale-tree semantics.

    A locale matches when it is requested, is an ancestor of a requested
    locale, or (depending on includeChildren/includeScripts) is a child or
    script variant of a requested locale.
    """

    def __init__(self, json_data):
        # Locales named explicitly in the whitelist.
        self.locales_requested = set()
        # Requested locales plus every ancestor up the tree.
        self.locales_required = set()
        self.include_children = json_data.get("includeChildren", True)
        self.include_scripts = json_data.get("includeScripts", False)

        # Compute the requested and required locales.
        for requested in json_data["whitelist"]:
            self._add_locale_and_parents(requested)

    def _add_locale_and_parents(self, locale):
        # The locale itself is *requested*...
        self.locales_requested.add(locale)
        # ...and it plus all of its ancestors are *required*.
        current = locale
        while current is not None:
            self.locales_required.add(current)
            current = self._get_parent_locale(current)

    def match(self, file):
        locale = self._file_to_file_stem(file)
        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale; required locales always match.
        if locale in self.locales_required:
            return True
        # Otherwise walk the tree, honoring include_scripts/include_children.
        return self._match_recursive(locale)

    def _match_recursive(self, locale):
        # Ascended out of the locale tree: no match.
        if locale is None:
            return False
        # Reached an explicitly *requested* locale: match.
        if locale in self.locales_requested:
            return True

        # Alternative scripts: lets sr_Latn check sr instead of going
        # directly to root.
        if self.include_scripts:
            script_match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if script_match and self._match_recursive(script_match.group(1)):
                return True

        # Descendants: match if any ancestor is a *requested* locale.
        if self.include_children:
            if self._match_recursive(self._get_parent_locale(locale)):
                return True

        return False

    @classmethod
    def _get_parent_locale(cls, locale):
        """Returns the parent of *locale*, or None at the top of the tree."""
        parents = DEPENDENCY_DATA["parents"]
        if locale in parents:
            return parents[locale]
        aliases = DEPENDENCY_DATA["aliases"]
        if locale in aliases:
            return aliases[locale]
        # A bare language code falls back to root.
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        # Otherwise truncate the last _SUBTAG; no underscore means no parent.
        underscore = locale.rfind("_")
        if underscore < 0:
            return None
        return locale[:underscore]
+
+
def apply_filters(requests, config):
    """Runs the filters and returns a new list of requests."""
    # File-level filters first, then resource-bundle-level filters.
    return _apply_resource_filters(_apply_file_filters(requests, config), config)
+
+
def _apply_file_filters(old_requests, config):
    """Filters out entire files; returns the surviving requests."""
    filters = _preprocess_file_filters(old_requests, config)
    new_requests = []
    for request in old_requests:
        if request.category in filters:
            # A filter is configured for this category: let it decide.
            new_requests.extend(filters[request.category].filter(request))
        else:
            # No filter configured: keep the request unchanged.
            new_requests.append(request)
    return new_requests
+
+
+def _preprocess_file_filters(requests, config):
+    all_categories = set(
+        request.category
+        for request in requests
+    )
+    all_categories.remove(None)
+    all_categories = list(sorted(all_categories))
+    json_data = config.filters_json_data
+    filters = {}
+    for category in all_categories:
+        if "featureFilters" in json_data and category in json_data["featureFilters"]:
+            filters[category] = Filter.create_from_json(
+                json_data["featureFilters"][category]
+            )
+        elif "localeFilter" in json_data and category[-5:] == "_tree":
+            filters[category] = Filter.create_from_json(
+                json_data["localeFilter"]
+            )
+    if "featureFilters" in json_data:
+        for category in json_data["featureFilters"]:
+            if category not in all_categories:
+                print("Warning: category %s is not known" % category, file=sys.stderr)
+    return filters
+
+
class ResourceFilterInfo(object):
    """Collects resource-filter rules for one category of genrb requests
    and turns them into filter-file generation requests."""

    def __init__(self, category):
        self.category = category
        # Directory (under TMP_DIR) that holds the generated filter files.
        self.filter_tmp_dir = "filters/%s" % category
        # All three are populated lazily by apply_to_requests/_set_files.
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
        """Attaches --filterDir and dependencies to matching genrb requests."""
        # Call this method only once per list of requests.
        assert self.input_files is None
        for request in all_requests:
            # Only genrb execution requests of this category with inputs.
            if (request.category != self.category
                    or not isinstance(request, AbstractExecutionRequest)
                    or request.tool != IcuTool("genrb")
                    or not request.input_files):
                continue
            self._set_files(request.input_files)
            # genrb must wait until the filter files exist.
            request.dep_targets += [self.filter_files[:]]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is None:
            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
            self.input_files = []
            self.filter_files = []
            self.rules_by_file = []

    def _set_files(self, files):
        # Note: The input files to genrb for a certain category should always
        # be the same. For example, there are often two genrb calls: one for
        # --writePoolBundle, and the other for --usePoolBundle. They are both
        # expected to have the same list of input files.
        if self.input_files is not None:
            assert self.input_files == files
            return
        self.input_files = list(files)
        basenames = [
            file.filename[file.filename.rfind("/") + 1:]
            for file in files
        ]
        self.filter_files = [
            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
            for basename in basenames
        ]
        # One (initially empty) rule list per input file, in parallel order.
        self.rules_by_file = [[] for _ in files]

    def add_rules(self, file_filter, rules):
        """Appends *rules* to every input file accepted by *file_filter*."""
        for input_file, rule_list in zip(self.input_files, self.rules_by_file):
            if file_filter.match(input_file):
                rule_list += rules

    def make_requests(self):
        """Returns print/copy requests that materialize the filter files."""
        # Group filter files sharing an identical rule list so each distinct
        # file is generated once and copied to its duplicates.
        unique_rules = defaultdict(list)
        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
            unique_rules[tuple(rules)].append(filter_file)

        new_requests = []
        counter = 0
        for rules, filter_files in unique_rules.items():
            base_filter_file = filter_files[0]
            new_requests.append(PrintFileRequest(
                name = "%s_print_%d" % (self.category, counter),
                output_file = base_filter_file,
                content = self._generate_resource_filter_txt(rules)
            ))
            counter += 1
            for duplicate_file in filter_files[1:]:
                new_requests.append(CopyRequest(
                    name = "%s_copy_%d" % (self.category, counter),
                    input_file = base_filter_file,
                    output_file = duplicate_file
                ))
                counter += 1
        return new_requests

    @classmethod
    def _generate_resource_filter_txt(cls, rules):
        """Renders a rule list as the text of a genrb filter file."""
        header = "# Caution: This file is automatically generated\n\n"
        return header + "\n".join(rules)
+
+
def _apply_resource_filters(all_requests, config):
    """Creates filters for looking within resource bundle files."""
    json_data = config.filters_json_data
    if "resourceFilters" not in json_data:
        return all_requests

    # Map from category name to its ResourceFilterInfo, created lazily.
    collected = {}
    for entry in json_data["resourceFilters"]:
        if "files" in entry:
            file_filter = Filter.create_from_json(entry["files"])
        else:
            # No "files" key: the rules apply to every input file.
            file_filter = InclusionFilter()
        for category in entry["categories"]:
            # not defaultdict because we need to pass arguments to the constructor
            filter_info = collected.get(category)
            if filter_info is None:
                filter_info = ResourceFilterInfo(category)
                filter_info.apply_to_requests(all_requests)
                collected[category] = filter_info
            filter_info.add_rules(file_filter, entry["rules"])

    # Add the filter generation requests to the beginning so that by default
    # they are made before genrb gets run (order is required by windirect)
    new_requests = []
    for filter_info in collected.values():
        new_requests += filter_info.make_requests()
    return new_requests + all_requests