# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from abc import abstractmethod
from collections import defaultdict
import re
import sys

from . import *
from . import utils
from .locale_dependencies import data as DEPENDENCY_DATA
from .request_types import *


# Note: for this to be a proper abstract class, it should extend abc.ABC.
# There is no nice way to do this that works in both Python 2 and 3.
# TODO(ICU-20301): Make this inherit from abc.ABC.
class Filter(object):
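    # Illustrative examples (hypothetical values) of the JSON shapes accepted
    # by create_from_json() below; the keys mirror the constructors in this
    # module, and "filterType" defaults to "file-stem" when it is absent:
    #
    #     {"whitelist": ["de", "root"]}
    #     {"filterType": "language", "blacklist": ["zh"]}
    #     {"filterType": "regex", "whitelist": ["^ja.*$"]}
    #     {"filterType": "union", "unionOf": [{"filterType": "exclude"}]}
    #     {"filterType": "locale", "whitelist": ["sr_Latn"], "includeScripts": true}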
    @staticmethod
    def create_from_json(json_data):
        if "filterType" in json_data:
            filter_type = json_data["filterType"]
        else:
            filter_type = "file-stem"

        if filter_type == "file-stem":
            return FileStemFilter(json_data)
        elif filter_type == "language":
            return LanguageFilter(json_data)
        elif filter_type == "regex":
            return RegexFilter(json_data)
        elif filter_type == "exclude":
            return ExclusionFilter()
        elif filter_type == "union":
            return UnionFilter(json_data)
        elif filter_type == "locale":
            return LocaleFilter(json_data)
        else:
            print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
            return None

    def filter(self, request):
        if not request.apply_file_filter(self):
            return []
        for file in request.all_input_files():
            assert self.match(file)
        return [request]

    @classmethod
    def _file_to_file_stem(cls, file):
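        # e.g. a filename such as "locale/de_AT.txt" (hypothetical path) has
        # the file stem "de_AT".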
        start = file.filename.rfind("/")
        limit = file.filename.rfind(".")
        return file.filename[start+1:limit]

    @abstractmethod
    def match(self, file):
        pass


class InclusionFilter(Filter):
    def match(self, file):
        return True


class ExclusionFilter(Filter):
    def match(self, file):
        return False


class WhitelistBlacklistFilter(Filter):
    def __init__(self, json_data):
        if "whitelist" in json_data:
            self.is_whitelist = True
            self.whitelist = json_data["whitelist"]
        else:
            assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data)
            self.is_whitelist = False
            self.blacklist = json_data["blacklist"]

    def match(self, file):
        file_stem = self._file_to_file_stem(file)
        return self._should_include(file_stem)

    @abstractmethod
    def _should_include(self, file_stem):
        pass


class FileStemFilter(WhitelistBlacklistFilter):
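    # Illustrative (hypothetical) example: {"whitelist": ["de", "root"]} keeps
    # only the files whose stems are exactly "de" or "root" in this category.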
    def _should_include(self, file_stem):
        if self.is_whitelist:
            return file_stem in self.whitelist
        else:
            return file_stem not in self.blacklist


class LanguageFilter(WhitelistBlacklistFilter):
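    # The language is the first "_"-delimited piece of the file stem; e.g. a
    # stem such as "sr_Latn_BA" (hypothetical file) has the language "sr".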
    def _should_include(self, file_stem):
        language = file_stem.split("_")[0]
        if language == "root":
            # Always include root.txt
            return True
        if self.is_whitelist:
            return language in self.whitelist
        else:
            return language not in self.blacklist


class RegexFilter(WhitelistBlacklistFilter):
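    # Patterns are applied with re.match(), i.e. anchored at the start of the
    # file stem; a hypothetical pattern such as "^ja.*" matches "ja_JP".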
    def __init__(self, *args):
        # TODO(ICU-20301): Change this to: super().__init__(*args)
        super(RegexFilter, self).__init__(*args)
        if self.is_whitelist:
            self.whitelist = [re.compile(pat) for pat in self.whitelist]
        else:
            self.blacklist = [re.compile(pat) for pat in self.blacklist]

    def _should_include(self, file_stem):
        if self.is_whitelist:
            for pattern in self.whitelist:
                if pattern.match(file_stem):
                    return True
            return False
        else:
            for pattern in self.blacklist:
                if pattern.match(file_stem):
                    return False
            return True


class UnionFilter(Filter):
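    # Illustrative (hypothetical) example: {"filterType": "union", "unionOf":
    # [{"filterType": "language", "whitelist": ["en"]}, {"filterType":
    # "file-stem", "whitelist": ["root"]}]} keeps a file if either sub-filter
    # keeps it.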
    def __init__(self, json_data):
        # Collect the sub-filters.
        self.sub_filters = []
        for filter_json in json_data["unionOf"]:
            self.sub_filters.append(Filter.create_from_json(filter_json))

    def match(self, file):
        """Match iff any of the sub-filters match."""
        for filter in self.sub_filters:
            if filter.match(file):
                return True
        return False


LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")

class LocaleFilter(Filter):
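    # Illustrative (hypothetical) example: {"filterType": "locale",
    # "whitelist": ["de", "sr_Latn"], "includeScripts": true} keeps de.txt,
    # sr_Latn.txt, their ancestors (e.g. root.txt), and, because children are
    # included by default, descendants such as de_AT.txt and sr_Latn_BA.txt,
    # assuming those data files exist in the category being filtered.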
    def __init__(self, json_data):
        self.locales_requested = set()
        self.locales_required = set()
        self.include_children = json_data.get("includeChildren", True)
        self.include_scripts = json_data.get("includeScripts", False)

        # Compute the requested and required locales.
        for locale in json_data["whitelist"]:
            self._add_locale_and_parents(locale)

    def _add_locale_and_parents(self, locale):
        # Store the locale as *requested*
        self.locales_requested.add(locale)
        # Store the locale and its dependencies as *required*
        while locale is not None:
            self.locales_required.add(locale)
            locale = self._get_parent_locale(locale)

    def match(self, file):
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale.
        if locale in self.locales_required:
            return True

        # Resolve include_scripts and include_children.
        return self._match_recursive(locale)

    def _match_recursive(self, locale):
        # Base case: return True if we reached a *requested* locale,
        # or False if we ascend out of the locale tree.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if match and self._match_recursive(match.group(1)):
                return True

        # Check if we are a descendant of a *requested* locale.
        if self.include_children:
            parent = self._get_parent_locale(locale)
            if self._match_recursive(parent):
                return True

        # No matches.
        return False

    @classmethod
    def _get_parent_locale(cls, locale):
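        # Lookup order: explicit "parents" and "aliases" entries from
        # locale_dependencies win; a bare language code falls back to "root";
        # otherwise the last "_" piece is dropped. A hypothetical chain,
        # assuming no explicit entry overrides it: "de_AT" -> "de" -> "root"
        # -> None.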
        if locale in DEPENDENCY_DATA["parents"]:
            return DEPENDENCY_DATA["parents"][locale]
        if locale in DEPENDENCY_DATA["aliases"]:
            return DEPENDENCY_DATA["aliases"][locale]
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        i = locale.rfind("_")
        if i < 0:
            return None
        return locale[:i]


def apply_filters(requests, config):
    """Runs the filters and returns a new list of requests."""
    requests = _apply_file_filters(requests, config)
    requests = _apply_resource_filters(requests, config)
    return requests


def _apply_file_filters(old_requests, config):
    """Filters out entire files."""
    filters = _preprocess_file_filters(old_requests, config)
    new_requests = []
    for request in old_requests:
        category = request.category
        if category in filters:
            new_requests += filters[category].filter(request)
        else:
            new_requests.append(request)
    return new_requests


def _preprocess_file_filters(requests, config):
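    # Illustrative (hypothetical) shape of config.filters_json_data handled
    # here: "featureFilters" is keyed by category name, and "localeFilter"
    # applies to every *_tree category without an explicit entry, e.g.:
    #
    #     {
    #         "featureFilters": {"brkitr_rules": {"filterType": "exclude"}},
    #         "localeFilter": {"filterType": "locale", "whitelist": ["en"]}
    #     }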
    all_categories = set(
        request.category
        for request in requests
    )
    all_categories.remove(None)
    all_categories = list(sorted(all_categories))
    json_data = config.filters_json_data
    filters = {}
    for category in all_categories:
        if "featureFilters" in json_data and category in json_data["featureFilters"]:
            filters[category] = Filter.create_from_json(
                json_data["featureFilters"][category]
            )
        elif "localeFilter" in json_data and category[-5:] == "_tree":
            filters[category] = Filter.create_from_json(
                json_data["localeFilter"]
            )
    if "featureFilters" in json_data:
        for category in json_data["featureFilters"]:
            if category not in all_categories:
                print("Warning: category %s is not known" % category, file=sys.stderr)
    return filters


class ResourceFilterInfo(object):
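    # Tracks, for a single category, the genrb input files and the filter
    # rules that apply to each of them, so that per-file filter files can be
    # generated under filters/<category> in the tmp dir (see make_requests).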
    def __init__(self, category):
        self.category = category
        self.filter_tmp_dir = "filters/%s" % category
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
        # Call this method only once per list of requests.
        assert self.input_files is None
        for request in all_requests:
            if request.category != self.category:
                continue
            if not isinstance(request, AbstractExecutionRequest):
                continue
            if request.tool != IcuTool("genrb"):
                continue
            if not request.input_files:
                continue
            self._set_files(request.input_files)
            request.dep_targets += [self.filter_files[:]]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is None:
            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
            self.input_files = []
            self.filter_files = []
            self.rules_by_file = []

    def _set_files(self, files):
        # Note: The input files to genrb for a certain category should always
        # be the same. For example, there are often two genrb calls: one for
        # --writePoolBundle, and the other for --usePoolBundle. They are both
        # expected to have the same list of input files.
        if self.input_files is not None:
            assert self.input_files == files
            return
        self.input_files = list(files)
        self.filter_files = [
            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
            for basename in (
                file.filename[file.filename.rfind("/")+1:]
                for file in files
            )
        ]
        self.rules_by_file = [[] for _ in range(len(files))]

    def add_rules(self, file_filter, rules):
        for file, rule_list in zip(self.input_files, self.rules_by_file):
            if file_filter.match(file):
                rule_list += rules

    def make_requests(self):
        # Map from rule list to filter files with that rule list
        unique_rules = defaultdict(list)
        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
            unique_rules[tuple(rules)].append(filter_file)

        new_requests = []
        i = 0
        for rules, filter_files in unique_rules.items():
            base_filter_file = filter_files[0]
            new_requests += [
                PrintFileRequest(
                    name = "%s_print_%d" % (self.category, i),
                    output_file = base_filter_file,
                    content = self._generate_resource_filter_txt(rules)
                )
            ]
            i += 1
            for filter_file in filter_files[1:]:
                new_requests += [
                    CopyRequest(
                        name = "%s_copy_%d" % (self.category, i),
                        input_file = base_filter_file,
                        output_file = filter_file
                    )
                ]
                i += 1
        return new_requests

    @classmethod
    def _generate_resource_filter_txt(cls, rules):
        result = "# Caution: This file is automatically generated\n\n"
        result += "\n".join(rules)
        return result


def _apply_resource_filters(all_requests, config):
    """Creates filters for looking within resource bundle files."""
    json_data = config.filters_json_data
    if "resourceFilters" not in json_data:
        return all_requests

    collected = {}
    for entry in json_data["resourceFilters"]:
        if "files" in entry:
            file_filter = Filter.create_from_json(entry["files"])
        else:
            file_filter = InclusionFilter()
        for category in entry["categories"]:
            # not defaultdict because we need to pass arguments to the constructor
            if category not in collected:
                filter_info = ResourceFilterInfo(category)
                filter_info.apply_to_requests(all_requests)
                collected[category] = filter_info
            else:
                filter_info = collected[category]
            filter_info.add_rules(file_filter, entry["rules"])

    # Add the filter generation requests to the beginning so that by default
    # they are made before genrb gets run (order is required by windirect)
    new_requests = []
    for filter_info in collected.values():
        new_requests += filter_info.make_requests()
    new_requests += all_requests
    return new_requests