# (removed scraped-page residue: a stray "]>" and the gitweb URL left over
# from the HTML capture of icuSources/python/icutools/databuilder/filtration.py)
# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from abc import abstractmethod
from collections import defaultdict
import re
import sys

from . import utils
from .request_types import *
# Note: for this to be a proper abstract class, it should extend abc.ABC.
# There is no nice way to do this that works in both Python 2 and 3.
# TODO(ICU-20301): Make this inherit from abc.ABC.
# NOTE(review): in the original file this is a @staticmethod on class Filter
# (callers below invoke it as Filter.create_from_json); the class header was
# lost in the scrape this file was recovered from.
def create_from_json(json_data, io):
    """Build a concrete filter from one entry of the filters config.

    json_data -- dict parsed from filters.json; may carry "filterType".
    io        -- I/O helper, forwarded to filters that read locale deps.

    Returns the constructed filter, or None (after printing an error to
    stderr) for an unrecognized "filterType".
    """
    if "filterType" in json_data:
        filter_type = json_data["filterType"]
    else:
        # No explicit type: treat the entry as a file-stem filter.
        filter_type = "file-stem"

    if filter_type == "file-stem":
        return FileStemFilter(json_data)
    elif filter_type == "language":
        return LanguageFilter(json_data)
    elif filter_type == "regex":
        return RegexFilter(json_data)
    elif filter_type == "exclude":
        return ExclusionFilter()
    elif filter_type == "union":
        return UnionFilter(json_data, io)
    elif filter_type == "locale":
        return LocaleFilter(json_data, io)
    else:
        print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
        return None
def filter(self, request):
    """Return [request] if this filter passes it through, else [].

    NOTE(review): the two return statements were missing from the scraped
    text; restored from the surrounding control flow (reject when
    request.apply_file_filter(self) declines, otherwise sanity-check that
    every remaining input file matches and keep the request).
    """
    if not request.apply_file_filter(self):
        return []
    for file in request.all_input_files():
        assert self.match(file)
    return [request]
54 def _file_to_file_stem(file):
55 start
= file.filename
.rfind("/")
56 limit
= file.filename
.rfind(".")
57 return file.filename
[start
+1:limit
]
60 def _file_to_subdir(file):
61 limit
= file.filename
.rfind("/")
64 return file.filename
[:limit
]
# NOTE(review): abstract hook of the (scrape-lost) Filter base class; the
# comments above reference the ICU-20301 plan to make Filter extend abc.ABC.
@abstractmethod
def match(self, file):
    """Return True if *file* passes this filter. Implemented by subclasses."""
    pass
class InclusionFilter(Filter):
    """Filter that accepts every file.

    NOTE(review): the method body was lost in the scrape; restored as
    `return True`, consistent with its use as the include-everything
    fallback in _apply_resource_filters.
    """

    def match(self, file):
        return True
class ExclusionFilter(Filter):
    """Filter that rejects every file.

    NOTE(review): the method body was lost in the scrape; restored as
    `return False`, consistent with create_from_json mapping
    filterType == "exclude" to this class.
    """

    def match(self, file):
        return False
class WhitelistBlacklistFilter(Filter):
    """Base class for filters configured with a whitelist OR a blacklist.

    Exactly one of json_data["whitelist"] / json_data["blacklist"] must be
    present; self.is_whitelist records which mode is active, and only the
    corresponding attribute (self.whitelist or self.blacklist) is set.
    """

    def __init__(self, json_data):
        if "whitelist" in json_data:
            self.is_whitelist = True
            self.whitelist = json_data["whitelist"]
        else:
            assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data)
            self.is_whitelist = False
            self.blacklist = json_data["blacklist"]

    def match(self, file):
        """Match against the file's stem, delegating to _should_include."""
        file_stem = self._file_to_file_stem(file)
        return self._should_include(file_stem)

    @abstractmethod
    def _should_include(self, file_stem):
        # Subclasses decide how a stem is tested against the configured list.
        pass
class FileStemFilter(WhitelistBlacklistFilter):
    """Whitelist/blacklist filter keyed on the literal file stem."""

    def _should_include(self, file_stem):
        if self.is_whitelist:
            return file_stem in self.whitelist
        return file_stem not in self.blacklist
class LanguageFilter(WhitelistBlacklistFilter):
    """Whitelist/blacklist filter keyed on the language subtag of the stem."""

    def _should_include(self, file_stem):
        language = file_stem.split("_")[0]
        if language == "root":
            # Always include root.txt
            # (the `return True` under this comment was lost in the scrape;
            # restored -- the comment itself states the intent)
            return True
        if self.is_whitelist:
            return language in self.whitelist
        return language not in self.blacklist
class RegexFilter(WhitelistBlacklistFilter):
    """Whitelist/blacklist filter whose list entries are regular expressions.

    Patterns are compiled once in __init__. A stem is included when some
    whitelist pattern matches it, or (blacklist mode) when no blacklist
    pattern matches it.
    """

    def __init__(self, *args):
        # TODO(ICU-20301): Change this to: super().__init__(*args)
        super(RegexFilter, self).__init__(*args)
        if self.is_whitelist:
            self.whitelist = [re.compile(pat) for pat in self.whitelist]
        else:
            self.blacklist = [re.compile(pat) for pat in self.blacklist]

    def _should_include(self, file_stem):
        # NOTE(review): the return statements inside these loops were lost in
        # the scrape; restored to first-match-wins semantics (whitelist match
        # -> include, blacklist match -> exclude).
        if self.is_whitelist:
            for pattern in self.whitelist:
                if pattern.match(file_stem):
                    return True
            return False
        for pattern in self.blacklist:
            if pattern.match(file_stem):
                return False
        return True
class UnionFilter(Filter):
    """Filter that is the union (logical OR) of several sub-filters."""

    def __init__(self, json_data, io):
        # Collect the sub-filters.
        self.sub_filters = []
        for filter_json in json_data["unionOf"]:
            self.sub_filters.append(Filter.create_from_json(filter_json, io))

    def match(self, file):
        """Match iff any of the sub-filters match."""
        # (return True/False lines restored from the docstring's contract;
        # they were lost in the scrape)
        for filter in self.sub_filters:
            if filter.match(file):
                return True
        return False
# Matches stems of the form lang_Scrp (e.g. "sr_Latn"); group(1) captures
# the bare 2- or 3-letter language subtag.
LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")

# Matches a bare 2- or 3-letter language subtag (e.g. "en", "fil").
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")
class LocaleFilter(Filter):
    """Filter that keeps requested locales plus, by configuration, their
    ancestors, descendants, and same-language script variants.

    json_data keys:
      "whitelist"       -- the requested locales (required).
      "includeChildren" -- also match descendants of requested locales
                           (default True).
      "includeScripts"  -- let lang_Scrp fall back to checking lang
                           (default False).

    NOTE(review): several return statements and base-case lines were lost in
    the scrape; they were restored from the surviving comments ("return True
    if we reached a *requested* locale, or False if we ascend out of the
    locale tree", "Returns a generator ...") -- verify against upstream ICU.
    """

    def __init__(self, json_data, io):
        self.locales_requested = list(json_data["whitelist"])
        self.include_children = json_data.get("includeChildren", True)
        self.include_scripts = json_data.get("includeScripts", False)

        # Load the dependency graph from disk
        self.dependency_data_by_tree = {
            tree: io.read_locale_deps(tree)
            for tree in utils.ALL_TREES
        }

    def match(self, file):
        tree = self._file_to_subdir(file)
        assert tree is not None
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale.
        if locale in self._locales_required(tree):
            return True

        # Resolve include_scripts and include_children.
        return self._match_recursive(locale, tree)

    def _match_recursive(self, locale, tree):
        # Base case: return True if we reached a *requested* locale,
        # or False if we ascend out of the locale tree.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if match and self._match_recursive(match.group(1), tree):
                return True

        # Check if we are a descendant of a *requested* locale.
        if self.include_children:
            parent = self._get_parent_locale(locale, tree)
            if self._match_recursive(parent, tree):
                return True

        return False

    def _get_parent_locale(self, locale, tree):
        """Gets the parent locale in the given tree, according to dependency data."""
        dependency_data = self.dependency_data_by_tree[tree]
        if "parents" in dependency_data and locale in dependency_data["parents"]:
            return dependency_data["parents"][locale]
        if "aliases" in dependency_data and locale in dependency_data["aliases"]:
            return dependency_data["aliases"][locale]
        if LANGUAGE_ONLY_REGEX.match(locale):
            # Bare language subtags parent directly to root.
            return "root"
        i = locale.rfind("_")
        if i < 0:
            # Only root itself has neither an underscore nor a listed parent.
            assert locale == "root"
            return None
        # Truncate the last subtag to get the parent.
        return locale[:i]

    def _locales_required(self, tree):
        """Returns a generator of all required locales in the given tree."""
        for locale in self.locales_requested:
            while locale is not None:
                yield locale
                locale = self._get_parent_locale(locale, tree)
def apply_filters(requests, config, io):
    """Runs the filters and returns a new list of requests."""
    requests = _apply_file_filters(requests, config, io)
    requests = _apply_resource_filters(requests, config, io)
    # (final return restored; the docstring states the function returns
    # the new request list)
    return requests
def _apply_file_filters(old_requests, config, io):
    """Filters out entire files.

    Requests whose category has a filter are passed through that filter
    (which may drop them); all other requests are kept unchanged.
    NOTE(review): the accumulator init, else branch, and return were lost
    in the scrape and have been restored.
    """
    filters = _preprocess_file_filters(old_requests, config, io)
    new_requests = []
    for request in old_requests:
        category = request.category
        if category in filters:
            new_requests += filters[category].filter(request)
        else:
            new_requests.append(request)
    return new_requests
def _preprocess_file_filters(requests, config, io):
    """Resolve the filters config into a dict mapping category -> Filter.

    Categories resolving to "include" are deliberately left out of the dict,
    meaning "keep everything in that category".
    NOTE(review): the dict initialization, the "include" no-op branch, and
    the final return were lost in the scrape and have been restored.
    """
    all_categories = set(
        request.category
        for request in requests
    )
    all_categories.remove(None)
    all_categories = list(sorted(all_categories))
    filters = {}
    json_data = config.filters_json_data
    default_filter_json = "exclude" if config.strategy == "additive" else "include"
    for category in all_categories:
        filter_json = default_filter_json
        # Figure out the correct filter to create
        if "featureFilters" in json_data and category in json_data["featureFilters"]:
            filter_json = json_data["featureFilters"][category]
        if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"):
            filter_json = json_data["localeFilter"]
        # Resolve the filter JSON into a filter object
        if filter_json == "exclude":
            filters[category] = ExclusionFilter()
        elif filter_json == "include":
            # No filter: everything in this category is kept.
            pass
        else:
            filters[category] = Filter.create_from_json(filter_json, io)
    if "featureFilters" in json_data:
        for category in json_data["featureFilters"]:
            if category not in all_categories:
                print("Warning: category %s is not known" % category, file=sys.stderr)
    return filters
class ResourceFilterInfo(object):
    """Tracks the resource-filter rules for one request category and rewrites
    the matching genrb requests to consume generated filter files.

    NOTE(review): this class was recovered from a line-mangled scrape; the
    continue/return statements and the request constructor names
    (PrintFileRequest, CopyRequest, TmpFile -- star-imported from
    .request_types) were restored and should be verified against upstream.
    """

    def __init__(self, category, strategy):
        self.category = category
        self.strategy = strategy
        # Directory (under the build tmp dir) holding this category's filters.
        self.filter_tmp_dir = "filters/%s" % category
        # Populated lazily by apply_to_requests/_set_files:
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
        # Call this method only once per list of requests.
        assert self.input_files is None
        for request in all_requests:
            if request.category != self.category:
                continue
            if not isinstance(request, AbstractExecutionRequest):
                continue
            if request.tool != IcuTool("genrb"):
                continue
            if not request.input_files:
                continue
            self._set_files(request.input_files)
            request.dep_targets += [self.filter_files[:]]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is None:
            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
            self.input_files = []
            self.filter_files = []
            self.rules_by_file = []

    def _set_files(self, files):
        # Note: The input files to genrb for a certain category should always
        # be the same. For example, there are often two genrb calls: one for
        # --writePoolBundle, and the other for --usePoolBundle. They are both
        # expected to have the same list of input files.
        if self.input_files is not None:
            assert self.input_files == files
            return
        self.input_files = list(files)
        # One filter file per input file, named by the input's basename.
        self.filter_files = [
            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
            for basename in (
                file.filename[file.filename.rfind("/") + 1:]
                for file in files
            )
        ]
        if self.strategy == "additive":
            # Additive: start from "remove everything", keeping only the
            # alias and parent markers.
            self.rules_by_file = [
                [r"-/", r"+/%%ALIAS", r"+/%%Parent"]
                for _ in range(len(files))
            ]
        else:
            # Subtractive: start from an empty rule list (keep everything).
            self.rules_by_file = [
                []
                for _ in range(len(files))
            ]

    def add_rules(self, file_filter, rules):
        """Append *rules* to every input file accepted by *file_filter*."""
        for file, rule_list in zip(self.input_files, self.rules_by_file):
            if file_filter.match(file):
                rule_list.extend(rules)

    def make_requests(self):
        # Map from rule list to filter files with that rule list
        unique_rules = defaultdict(list)
        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
            unique_rules[tuple(rules)].append(filter_file)

        new_requests = []
        i = 0
        for rules, filter_files in unique_rules.items():
            # Generate each distinct rule list once, then copy it to the
            # other filter files that share it.
            base_filter_file = filter_files[0]
            new_requests += [
                PrintFileRequest(
                    name = "%s_print_%d" % (self.category, i),
                    output_file = base_filter_file,
                    content = self._generate_resource_filter_txt(rules)
                )
            ]
            i += 1
            for filter_file in filter_files[1:]:
                new_requests += [
                    CopyRequest(
                        name = "%s_copy_%d" % (self.category, i),
                        input_file = base_filter_file,
                        output_file = filter_file
                    )
                ]
                i += 1
        return new_requests

    @staticmethod
    def _generate_resource_filter_txt(rules):
        """Render one rule list as the text of a genrb filter file."""
        result = "# Caution: This file is automatically generated\n\n"
        result += "\n".join(rules)
        return result
def _apply_resource_filters(all_requests, config, io):
    """Creates filters for looking within resource bundle files.

    NOTE(review): the early return, the `collected` dict init, the
    "files"-key branching, and the final return were lost in the scrape
    and have been restored.
    """
    json_data = config.filters_json_data
    if "resourceFilters" not in json_data:
        return all_requests

    collected = {}
    for entry in json_data["resourceFilters"]:
        if "files" in entry:
            file_filter = Filter.create_from_json(entry["files"], io)
        else:
            # No "files" key: the rules apply to every input file.
            file_filter = InclusionFilter()
        for category in entry["categories"]:
            # not defaultdict because we need to pass arguments to the constructor
            if category not in collected:
                filter_info = ResourceFilterInfo(category, config.strategy)
                filter_info.apply_to_requests(all_requests)
                collected[category] = filter_info
            else:
                filter_info = collected[category]
            filter_info.add_rules(file_filter, entry["rules"])

    # Add the filter generation requests to the beginning so that by default
    # they are made before genrb gets run (order is required by windirect)
    new_requests = []
    for filter_info in collected.values():
        new_requests += filter_info.make_requests()
    new_requests += all_requests
    return new_requests