# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from abc import abstractmethod
from collections import defaultdict
import re
import sys

from . import *
from . import utils
from .locale_dependencies import data as DEPENDENCY_DATA
from .request_types import *


# Note: for this to be a proper abstract class, it should extend abc.ABC.
# There is no nice way to do this that works in both Python 2 and 3.
# TODO(ICU-20301): Make this inherit from abc.ABC.
class Filter(object):
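    """Base class for whole-file filters.

    A Filter decides, per input data file, whether that file is kept in a
    request; subclasses implement match(file).
    """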
    @staticmethod
    def create_from_json(json_data):
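        """Build a Filter from one filter JSON object.

        The recognized keys are the ones read below ("filterType" plus the
        keys used by each subclass). An illustrative whitelist filter, for
        example, could look like:

            {"filterType": "language", "whitelist": ["en", "de"]}

        If "filterType" is omitted, "file-stem" is assumed.
        """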
        if "filterType" in json_data:
            filter_type = json_data["filterType"]
        else:
            filter_type = "file-stem"

        if filter_type == "file-stem":
            return FileStemFilter(json_data)
        elif filter_type == "language":
            return LanguageFilter(json_data)
        elif filter_type == "regex":
            return RegexFilter(json_data)
        elif filter_type == "exclude":
            return ExclusionFilter()
        elif filter_type == "union":
            return UnionFilter(json_data)
        elif filter_type == "locale":
            return LocaleFilter(json_data)
        else:
            print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
            return None

    def filter(self, request):
        if not request.apply_file_filter(self):
            return []
        for file in request.all_input_files():
            assert self.match(file)
        return [request]

    @classmethod
    def _file_to_file_stem(cls, file):
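        """Return the file name without its directory or extension.

        For example, a hypothetical input file named "locales/en_GB.txt"
        yields the stem "en_GB".
        """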
        start = file.filename.rfind("/")
        limit = file.filename.rfind(".")
        return file.filename[start+1:limit]

    @abstractmethod
    def match(self, file):
        pass


class InclusionFilter(Filter):
    def match(self, file):
        return True


class ExclusionFilter(Filter):
    def match(self, file):
        return False


class WhitelistBlacklistFilter(Filter):
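    """Filter backed by an explicit whitelist or blacklist.

    Subclasses decide what part of the file stem the list entries are
    compared against via _should_include().
    """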
    def __init__(self, json_data):
        if "whitelist" in json_data:
            self.is_whitelist = True
            self.whitelist = json_data["whitelist"]
        else:
            assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data)
            self.is_whitelist = False
            self.blacklist = json_data["blacklist"]

    def match(self, file):
        file_stem = self._file_to_file_stem(file)
        return self._should_include(file_stem)

    @abstractmethod
    def _should_include(self, file_stem):
        pass


class FileStemFilter(WhitelistBlacklistFilter):
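    """Filters files by their exact file stem (e.g. "en_GB")."""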
    def _should_include(self, file_stem):
        if self.is_whitelist:
            return file_stem in self.whitelist
        else:
            return file_stem not in self.blacklist


class LanguageFilter(WhitelistBlacklistFilter):
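    """Filters files by language subtag (the file stem up to the first
    underscore); root.txt is always kept."""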
    def _should_include(self, file_stem):
        language = file_stem.split("_")[0]
        if language == "root":
            # Always include root.txt
            return True
        if self.is_whitelist:
            return language in self.whitelist
        else:
            return language not in self.blacklist


class RegexFilter(WhitelistBlacklistFilter):
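    """Filters file stems against a list of regular expressions.

    Patterns are applied with re.match, so they are anchored at the start
    of the file stem.
    """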
    def __init__(self, *args):
        # TODO(ICU-20301): Change this to: super().__init__(*args)
        super(RegexFilter, self).__init__(*args)
        if self.is_whitelist:
            self.whitelist = [re.compile(pat) for pat in self.whitelist]
        else:
            self.blacklist = [re.compile(pat) for pat in self.blacklist]

    def _should_include(self, file_stem):
        if self.is_whitelist:
            for pattern in self.whitelist:
                if pattern.match(file_stem):
                    return True
            return False
        else:
            for pattern in self.blacklist:
                if pattern.match(file_stem):
                    return False
            return True


class UnionFilter(Filter):
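    """Logical OR of several sub-filters.

    The JSON object is expected to carry the sub-filters under "unionOf";
    an illustrative example could look like:

        {"filterType": "union", "unionOf": [
            {"filterType": "language", "whitelist": ["en"]},
            {"filterType": "regex", "whitelist": ["^zh_Hant.*$"]}
        ]}
    """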
    def __init__(self, json_data):
        # Collect the sub-filters.
        self.sub_filters = []
        for filter_json in json_data["unionOf"]:
            self.sub_filters.append(Filter.create_from_json(filter_json))

    def match(self, file):
        """Match iff any of the sub-filters match."""
        for filter in self.sub_filters:
            if filter.match(file):
                return True
        return False


LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")

class LocaleFilter(Filter):
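    """Filters locale data files by a whitelist of locales.

    Keys read from the JSON (see __init__): "whitelist", plus the optional
    booleans "includeChildren" (default True) and "includeScripts"
    (default False). An illustrative example:

        {"filterType": "locale", "includeScripts": true, "whitelist": ["de_CH"]}
    """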
    def __init__(self, json_data):
        self.locales_requested = set()
        self.locales_required = set()
        self.include_children = json_data.get("includeChildren", True)
        self.include_scripts = json_data.get("includeScripts", False)

        # Compute the requested and required locales.
        for locale in json_data["whitelist"]:
            self._add_locale_and_parents(locale)

    def _add_locale_and_parents(self, locale):
        # Store the locale as *requested*
        self.locales_requested.add(locale)
        # Store the locale and its dependencies as *required*
        while locale is not None:
            self.locales_required.add(locale)
            locale = self._get_parent_locale(locale)

    def match(self, file):
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale.
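        # For example, if "de_CH" was requested, then "de_CH", "de", and
        # "root" are all required.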
        if locale in self.locales_required:
            return True

        # Resolve include_scripts and include_children.
        return self._match_recursive(locale)

    def _match_recursive(self, locale):
        # Base case: return True if we reached a *requested* locale,
        # or False if we ascend out of the locale tree.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if match and self._match_recursive(match.group(1)):
                return True

        # Check if we are a descendant of a *requested* locale.
        if self.include_children:
            parent = self._get_parent_locale(locale)
            if self._match_recursive(parent):
                return True

        # No matches.
        return False

    @classmethod
    def _get_parent_locale(cls, locale):
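        """Return the parent of the given locale, or None at the top.

        Resolution order: explicit parent from locale_dependencies, then
        alias, then "root" for bare languages, then truncation at the last
        underscore (e.g. "de_CH" -> "de" -> "root" -> None).
        """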
        if locale in DEPENDENCY_DATA["parents"]:
            return DEPENDENCY_DATA["parents"][locale]
        if locale in DEPENDENCY_DATA["aliases"]:
            return DEPENDENCY_DATA["aliases"][locale]
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        i = locale.rfind("_")
        if i < 0:
            return None
        return locale[:i]


def apply_filters(requests, config):
    """Runs the filters and returns a new list of requests."""
    requests = _apply_file_filters(requests, config)
    requests = _apply_resource_filters(requests, config)
    return requests


def _apply_file_filters(old_requests, config):
    """Filters out entire files."""
    filters = _preprocess_file_filters(old_requests, config)
    new_requests = []
    for request in old_requests:
        category = request.category
        if category in filters:
            new_requests += filters[category].filter(request)
        else:
            new_requests.append(request)
    return new_requests


def _preprocess_file_filters(requests, config):
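    """Builds a map from category to Filter based on the config.

    The shape of config.filters_json_data assumed here (inferred from the
    lookups below; illustrative only, not an exhaustive reference):

        {
            "featureFilters": {"brkitr_tree": {"filterType": "exclude"}},
            "localeFilter": {"filterType": "language", "whitelist": ["en"]}
        }

    "localeFilter" applies to every category ending in "_tree" that has no
    entry in "featureFilters".
    """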
    all_categories = set(
        request.category
        for request in requests
    )
    all_categories.remove(None)
    all_categories = list(sorted(all_categories))
    json_data = config.filters_json_data
    filters = {}
    for category in all_categories:
        if "featureFilters" in json_data and category in json_data["featureFilters"]:
            filters[category] = Filter.create_from_json(
                json_data["featureFilters"][category]
            )
        elif "localeFilter" in json_data and category[-5:] == "_tree":
            filters[category] = Filter.create_from_json(
                json_data["localeFilter"]
            )
    if "featureFilters" in json_data:
        for category in json_data["featureFilters"]:
            if category not in all_categories:
                print("Warning: category %s is not known" % category, file=sys.stderr)
    return filters


class ResourceFilterInfo(object):
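    """Accumulates genrb resource-filter rules for one category.

    apply_to_requests() rewrites the matching genrb requests to read a
    --filterDir of generated filter files; add_rules() attaches rules to
    individual input files; make_requests() emits the requests that write
    those filter files.
    """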
    def __init__(self, category):
        self.category = category
        self.filter_tmp_dir = "filters/%s" % category
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
        # Call this method only once per list of requests.
        assert self.input_files is None
        for request in all_requests:
            if request.category != self.category:
                continue
            if not isinstance(request, AbstractExecutionRequest):
                continue
            if request.tool != IcuTool("genrb"):
                continue
            if not request.input_files:
                continue
            self._set_files(request.input_files)
            request.dep_targets += [self.filter_files[:]]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is None:
            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
            self.input_files = []
            self.filter_files = []
            self.rules_by_file = []

    def _set_files(self, files):
        # Note: The input files to genrb for a certain category should always
        # be the same. For example, there are often two genrb calls: one for
        # --writePoolBundle, and the other for --usePoolBundle. They are both
        # expected to have the same list of input files.
        if self.input_files is not None:
            assert self.input_files == files
            return
        self.input_files = list(files)
        self.filter_files = [
            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
            for basename in (
                file.filename[file.filename.rfind("/")+1:]
                for file in files
            )
        ]
        self.rules_by_file = [[] for _ in range(len(files))]

    def add_rules(self, file_filter, rules):
        for file, rule_list in zip(self.input_files, self.rules_by_file):
            if file_filter.match(file):
                rule_list += rules

    def make_requests(self):
        # Map from rule list to filter files with that rule list
        unique_rules = defaultdict(list)
        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
            unique_rules[tuple(rules)].append(filter_file)

        new_requests = []
        i = 0
        for rules, filter_files in unique_rules.items():
            base_filter_file = filter_files[0]
            new_requests += [
                PrintFileRequest(
                    name = "%s_print_%d" % (self.category, i),
                    output_file = base_filter_file,
                    content = self._generate_resource_filter_txt(rules)
                )
            ]
            i += 1
            for filter_file in filter_files[1:]:
                new_requests += [
                    CopyRequest(
                        name = "%s_copy_%d" % (self.category, i),
                        input_file = base_filter_file,
                        output_file = filter_file
                    )
                ]
                i += 1
        return new_requests

    @classmethod
    def _generate_resource_filter_txt(cls, rules):
        result = "# Caution: This file is automatically generated\n\n"
        result += "\n".join(rules)
        return result


def _apply_resource_filters(all_requests, config):
    """Creates filters for looking within resource bundle files."""
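    # An illustrative "resourceFilters" entry (keys taken from the loop
    # below; rule strings are written verbatim into the generated filter
    # files consumed by genrb):
    #
    #   {
    #       "categories": ["locales_tree"],
    #       "files": {"whitelist": ["de", "de_CH"]},
    #       "rules": ["-/characters"]
    #   }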
    json_data = config.filters_json_data
    if "resourceFilters" not in json_data:
        return all_requests

    collected = {}
    for entry in json_data["resourceFilters"]:
        if "files" in entry:
            file_filter = Filter.create_from_json(entry["files"])
        else:
            file_filter = InclusionFilter()
        for category in entry["categories"]:
            # not defaultdict because we need to pass arguments to the constructor
            if category not in collected:
                filter_info = ResourceFilterInfo(category)
                filter_info.apply_to_requests(all_requests)
                collected[category] = filter_info
            else:
                filter_info = collected[category]
            filter_info.add_rules(file_filter, entry["rules"])

    # Add the filter generation requests to the beginning so that by default
    # they are made before genrb gets run (order is required by windirect)
    new_requests = []
    for filter_info in collected.values():
        new_requests += filter_info.make_requests()
    new_requests += all_requests
    return new_requests