]> git.saurik.com Git - apple/icu.git/blob - icuSources/python/icutools/databuilder/filtration.py
ICU-66108.tar.gz
[apple/icu.git] / icuSources / python / icutools / databuilder / filtration.py
1 # Copyright (C) 2018 and later: Unicode, Inc. and others.
2 # License & terms of use: http://www.unicode.org/copyright.html
3
4 # Python 2/3 Compatibility (ICU-20299)
5 # TODO(ICU-20301): Remove this.
6 from __future__ import print_function
7
8 from abc import abstractmethod
9 from collections import defaultdict
10 import re
11 import sys
12
13 from . import *
14 from . import utils
15 from .request_types import *
16
17
18 # Note: for this to be a proper abstract class, it should extend abc.ABC.
19 # There is no nice way to do this that works in both Python 2 and 3.
20 # TODO(ICU-20301): Make this inherit from abc.ABC.
class Filter(object):
    """Abstract base class for filters that decide which data files to keep.

    Note: for this to be a proper abstract class, it should extend abc.ABC.
    There is no nice way to do this that works in both Python 2 and 3.
    TODO(ICU-20301): Make this inherit from abc.ABC.
    """

    @staticmethod
    def create_from_json(json_data, io):
        """Creates a concrete Filter from a JSON filter specification.

        Args:
            json_data: dict parsed from the filter config file. The
                "filterType" key defaults to "file-stem" when absent.
            io: I/O helper; required by filter types that read locale
                dependency data ("union", "locale").

        Returns:
            A Filter instance, or None for an unrecognized filterType
            (an error is printed to stderr in that case).
        """
        assert io is not None
        # Default to "file-stem" when the spec does not name a type.
        filter_type = json_data.get("filterType", "file-stem")

        if filter_type == "file-stem":
            return FileStemFilter(json_data)
        elif filter_type == "language":
            return LanguageFilter(json_data)
        elif filter_type == "regex":
            return RegexFilter(json_data)
        elif filter_type == "exclude":
            return ExclusionFilter()
        elif filter_type == "union":
            return UnionFilter(json_data, io)
        elif filter_type == "locale":
            return LocaleFilter(json_data, io)
        else:
            print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
            return None

    def filter(self, request):
        """Applies this filter to a request.

        Returns [] if the whole request is filtered out, else [request].
        """
        if not request.apply_file_filter(self):
            return []
        # Sanity check: after file filtering, every input file remaining
        # on the request must individually match this filter.
        for file in request.all_input_files():
            assert self.match(file)
        return [request]

    @staticmethod
    def _file_to_file_stem(file):
        """Returns the file's basename without its extension.

        Relies on str.rfind returning -1 when there is no "/", so a
        bare filename like "root.txt" yields "root".
        """
        start = file.filename.rfind("/")
        limit = file.filename.rfind(".")
        return file.filename[start+1:limit]

    @staticmethod
    def _file_to_subdir(file):
        """Returns the directory part of the path, or None if no "/"."""
        limit = file.filename.rfind("/")
        if limit == -1:
            return None
        return file.filename[:limit]

    @abstractmethod
    def match(self, file):
        """Returns True if the given input file passes this filter."""
        pass
69
70
class InclusionFilter(Filter):
    """Trivial filter that keeps every file."""

    def match(self, file):
        """Always matches."""
        return True
74
75
class ExclusionFilter(Filter):
    """Trivial filter that rejects every file."""

    def match(self, file):
        """Never matches."""
        return False
79
80
class WhitelistBlacklistFilter(Filter):
    """Shared base for filters driven by a whitelist or a blacklist.

    Exactly one of "whitelist" or "blacklist" must appear in the JSON
    config. Subclasses implement _should_include(file_stem).
    """

    def __init__(self, json_data):
        # Record which mode we are in, then load the matching list.
        self.is_whitelist = "whitelist" in json_data
        if self.is_whitelist:
            self.whitelist = json_data["whitelist"]
        else:
            assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data)
            self.blacklist = json_data["blacklist"]

    def match(self, file):
        """Delegates to _should_include with the file's stem."""
        return self._should_include(self._file_to_file_stem(file))

    @abstractmethod
    def _should_include(self, file_stem):
        """Returns True if a file with this stem should be kept."""
        pass
98
99
class FileStemFilter(WhitelistBlacklistFilter):
    """Filters files by exact file stem (e.g. "en_US")."""

    def _should_include(self, file_stem):
        """Whitelist: stem must be listed; blacklist: stem must not be."""
        if self.is_whitelist:
            return file_stem in self.whitelist
        return file_stem not in self.blacklist
106
107
class LanguageFilter(WhitelistBlacklistFilter):
    """Filters files by language code (the stem up to the first "_")."""

    def _should_include(self, file_stem):
        """Checks the language subtag against the configured list."""
        language, _, _ = file_stem.partition("_")
        # Always include root.txt
        if language == "root":
            return True
        if self.is_whitelist:
            return language in self.whitelist
        return language not in self.blacklist
118
119
class RegexFilter(WhitelistBlacklistFilter):
    """Filters file stems against a list of regular expressions."""

    def __init__(self, *args):
        # TODO(ICU-20301): Change this to: super().__init__(*args)
        super(RegexFilter, self).__init__(*args)
        # Compile the configured patterns up front so match() stays cheap.
        if self.is_whitelist:
            self.whitelist = [re.compile(p) for p in self.whitelist]
        else:
            self.blacklist = [re.compile(p) for p in self.blacklist]

    def _should_include(self, file_stem):
        """Whitelist: any pattern may match; blacklist: none may match."""
        if self.is_whitelist:
            return any(p.match(file_stem) for p in self.whitelist)
        return not any(p.match(file_stem) for p in self.blacklist)
140
141
class UnionFilter(Filter):
    """Composite filter that keeps a file if any sub-filter keeps it."""

    def __init__(self, json_data, io):
        # Build one sub-filter per entry of the "unionOf" array.
        self.sub_filters = [
            Filter.create_from_json(sub_json, io)
            for sub_json in json_data["unionOf"]
        ]

    def match(self, file):
        """Match iff any of the sub-filters match."""
        return any(sub.match(file) for sub in self.sub_filters)
155
156
# Matches a stem like "sr_Latn": a 2-3 letter lowercase language code
# followed by a single 4-letter title-case script code; group(1) is the
# bare language code.
LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
# Matches a bare 2-3 letter lowercase language code with no subtags.
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")
159
class LocaleFilter(Filter):
    """Filters locale data files against a requested locale list.

    A file is kept if its locale is *required* (requested, or an
    ancestor of a requested locale), or — depending on configuration —
    if it is a child or an alternative-script variant of a requested
    locale. Parent/alias relationships come from per-tree dependency
    data loaded via the io helper.
    """

    def __init__(self, json_data, io):
        # Locales explicitly listed in the filter config.
        self.locales_requested = list(json_data["whitelist"])
        # Whether descendants of requested locales are also included.
        self.include_children = json_data.get("includeChildren", True)
        # Whether alternative-script variants (e.g. sr_Latn when sr is
        # requested) are also included.
        self.include_scripts = json_data.get("includeScripts", False)

        # Load the dependency graph from disk
        self.dependency_data_by_tree = {
            tree: io.read_locale_deps(tree)
            for tree in utils.ALL_TREES
        }

    def match(self, file):
        """Returns True if the file's locale should be included.

        The tree (subdirectory) selects which dependency data to use;
        the file stem is the locale name.
        """
        tree = self._file_to_subdir(file)
        assert tree is not None
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale.
        if locale in self._locales_required(tree):
            return True

        # Resolve include_scripts and include_children.
        return self._match_recursive(locale, tree)

    def _match_recursive(self, locale, tree):
        """Walks up the locale tree looking for a *requested* locale.

        Returns True if this locale, its script-stripped form (when
        include_scripts), or any ancestor (when include_children) is in
        the requested list.
        """
        # Base case: return True if we reached a *requested* locale,
        # or False if we ascend out of the locale tree.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if match and self._match_recursive(match.group(1), tree):
                return True

        # Check if we are a descendant of a *requested* locale.
        if self.include_children:
            parent = self._get_parent_locale(locale, tree)
            if self._match_recursive(parent, tree):
                return True

        # No matches.
        return False

    def _get_parent_locale(self, locale, tree):
        """Gets the parent locale in the given tree, according to dependency data."""
        dependency_data = self.dependency_data_by_tree[tree]
        # Explicit parent entries in the dependency data take precedence.
        if "parents" in dependency_data and locale in dependency_data["parents"]:
            return dependency_data["parents"][locale]
        # Aliases are followed like parents for inclusion purposes.
        if "aliases" in dependency_data and locale in dependency_data["aliases"]:
            return dependency_data["aliases"][locale]
        # A bare language code falls back directly to root.
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        # Otherwise truncate the final _subtag; root itself has no parent.
        i = locale.rfind("_")
        if i < 0:
            assert locale == "root"
            return None
        return locale[:i]

    def _locales_required(self, tree):
        """Returns a generator of all required locales in the given tree."""
        # Yields each requested locale together with its ancestor chain.
        for locale in self.locales_requested:
            while locale is not None:
                yield locale
                locale = self._get_parent_locale(locale, tree)
230
231
def apply_filters(requests, config, io):
    """Runs the filters and returns a new list of requests.

    File-level filters run first, then within-file resource filters.
    """
    file_filtered = _apply_file_filters(requests, config, io)
    return _apply_resource_filters(file_filtered, config, io)
237
238
def _apply_file_filters(old_requests, config, io):
    """Filters out entire files.

    Requests whose category has no configured filter pass through
    unchanged; the rest are narrowed (or dropped) by their filter.
    """
    filters = _preprocess_file_filters(old_requests, config, io)
    new_requests = []
    for request in old_requests:
        if request.category in filters:
            new_requests.extend(filters[request.category].filter(request))
        else:
            new_requests.append(request)
    return new_requests
250
251
252 def _preprocess_file_filters(requests, config, io):
253 all_categories = set(
254 request.category
255 for request in requests
256 )
257 all_categories.remove(None)
258 all_categories = list(sorted(all_categories))
259 json_data = config.filters_json_data
260 filters = {}
261 default_filter_json = "exclude" if config.strategy == "additive" else "include"
262 for category in all_categories:
263 filter_json = default_filter_json
264 # Figure out the correct filter to create
265 if "featureFilters" in json_data and category in json_data["featureFilters"]:
266 filter_json = json_data["featureFilters"][category]
267 if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"):
268 filter_json = json_data["localeFilter"]
269 # Resolve the filter JSON into a filter object
270 if filter_json == "exclude":
271 filters[category] = ExclusionFilter()
272 elif filter_json == "include":
273 pass # no-op
274 else:
275 filters[category] = Filter.create_from_json(filter_json, io)
276 if "featureFilters" in json_data:
277 for category in json_data["featureFilters"]:
278 if category not in all_categories:
279 print("Warning: category %s is not known" % category, file=sys.stderr)
280 return filters
281
282
class ResourceFilterInfo(object):
    """Collects within-file resource filter rules for one request category.

    Builds one filter rule file per genrb input file, patches the
    matching genrb requests to pass --filterDir, and later emits the
    requests that actually write those rule files to disk.
    """

    def __init__(self, category, strategy):
        # Category of requests this filter applies to (e.g. a *_tree name).
        self.category = category
        # Build strategy: "additive" starts from nothing; otherwise
        # subtractive, starting from everything.
        self.strategy = strategy
        # Directory (relative, under {TMP_DIR}) holding the rule files.
        self.filter_tmp_dir = "filters/%s" % category
        # The following three parallel lists are populated by
        # apply_to_requests / _set_files.
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
        """Finds this category's genrb requests and attaches filter args in place."""
        # Call this method only once per list of requests.
        assert self.input_files is None
        for request in all_requests:
            if request.category != self.category:
                continue
            if not isinstance(request, AbstractExecutionRequest):
                continue
            if request.tool != IcuTool("genrb"):
                continue
            if not request.input_files:
                continue
            self._set_files(request.input_files)
            # The rule files must be generated before genrb runs.
            request.dep_targets += [self.filter_files[:]]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is None:
            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
            self.input_files = []
            self.filter_files = []
            self.rules_by_file = []

    def _set_files(self, files):
        # Note: The input files to genrb for a certain category should always
        # be the same. For example, there are often two genrb calls: one for
        # --writePoolBundle, and the other for --usePoolBundle. They are both
        # expected to have the same list of input files.
        if self.input_files is not None:
            assert self.input_files == files
            return
        self.input_files = list(files)
        # One rule file per input file, named after the input's basename.
        self.filter_files = [
            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
            for basename in (
                file.filename[file.filename.rfind("/")+1:]
                for file in files
            )
        ]
        if self.strategy == "additive":
            # Additive: start by removing everything, but always keep the
            # alias and parent entries so the bundle structure stays valid.
            self.rules_by_file = [
                [r"-/", r"+/%%ALIAS", r"+/%%Parent"]
                for _ in range(len(files))
            ]
        else:
            # Subtractive: start by including everything.
            self.rules_by_file = [
                [r"+/"]
                for _ in range(len(files))
            ]

    def add_rules(self, file_filter, rules):
        """Appends rules to every input file accepted by file_filter."""
        for file, rule_list in zip(self.input_files, self.rules_by_file):
            if file_filter.match(file):
                rule_list += rules

    def make_requests(self):
        """Returns requests that write the rule files (print one, copy the rest)."""
        # Map from rule list to filter files with that rule list
        unique_rules = defaultdict(list)
        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
            unique_rules[tuple(rules)].append(filter_file)

        new_requests = []
        i = 0
        for rules, filter_files in unique_rules.items():
            # Generate each unique rule list once, then copy it to the
            # remaining files that share the same rules.
            base_filter_file = filter_files[0]
            new_requests += [
                PrintFileRequest(
                    name = "%s_print_%d" % (self.category, i),
                    output_file = base_filter_file,
                    content = self._generate_resource_filter_txt(rules)
                )
            ]
            i += 1
            for filter_file in filter_files[1:]:
                new_requests += [
                    CopyRequest(
                        name = "%s_copy_%d" % (self.category, i),
                        input_file = base_filter_file,
                        output_file = filter_file
                    )
                ]
                i += 1
        return new_requests

    @staticmethod
    def _generate_resource_filter_txt(rules):
        """Renders a rule list as the text content of a filter rule file."""
        result = "# Caution: This file is automatically generated\n\n"
        result += "\n".join(rules)
        return result
382
383
def _apply_resource_filters(all_requests, config, io):
    """Creates filters for looking within resource bundle files."""
    json_data = config.filters_json_data
    if "resourceFilters" not in json_data:
        return all_requests

    collected = {}
    for entry in json_data["resourceFilters"]:
        # Entries without a "files" filter apply to every input file.
        if "files" in entry:
            file_filter = Filter.create_from_json(entry["files"], io)
        else:
            file_filter = InclusionFilter()
        for category in entry["categories"]:
            # not defaultdict because we need to pass arguments to the constructor
            filter_info = collected.get(category)
            if filter_info is None:
                filter_info = ResourceFilterInfo(category, config.strategy)
                filter_info.apply_to_requests(all_requests)
                collected[category] = filter_info
            filter_info.add_rules(file_filter, entry["rules"])

    # Add the filter generation requests to the beginning so that by default
    # they are made before genrb gets run (order is required by windirect)
    generated = []
    for filter_info in collected.values():
        generated.extend(filter_info.make_requests())
    return generated + all_requests