]>
Commit | Line | Data |
---|---|---|
3d1f044b A |
1 | #! /usr/bin/python -B |
2 | ||
3 | # Copyright (C) 2016 and later: Unicode, Inc. and others. | |
4 | # License & terms of use: http://www.unicode.org/copyright.html | |
5 | ||
6 | # Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others. | |
7 | # All rights reserved. | |
8 | ||
9 | # | |
10 | # Script to check that ICU source files contain only valid UTF-8 encoded text, | |
11 | # and that no file contains a Byte Order Mark (BOM), except for '.txt', '.sln', '.targets' and '.vcxproj*' files. | |
12 | # | |
13 | # THIS SCRIPT DOES NOT WORK ON WINDOWS | |
14 | # It only works correctly on platforms where the native line ending is a plain \n | |
15 | # | |
16 | # usage: | |
17 | # icu-file-utf8-check.py [options] | |
18 | # | |
19 | # options: | |
20 | # -h | --help Print a usage line and exit. | |
21 | # | |
22 | # The tool operates recursively on the directory from which it is run. | |
23 | # Only files from the ICU github repository are checked. | |
24 | # The tool only reads files; neither the repository nor the working copy is altered. | |
25 | ||
26 | from __future__ import print_function | |
27 | ||
28 | import sys | |
29 | import os | |
30 | import os.path | |
31 | import re | |
32 | import getopt | |
33 | ||
34 | ||
def runCommand(cmd):
    """Run a shell command and return everything it wrote to stdout.

    cmd: command line string, handed to the shell through os.popen().

    Returns the captured standard output as a string.
    If the command fails, a message is printed to stderr and the script
    exits with a non-zero status.
    """
    output_file = os.popen(cmd)
    try:
        output_text = output_file.read()
    finally:
        # close() returns None on success, otherwise the command's status.
        exit_status = output_file.close()
    if exit_status:
        print('"', cmd, '" failed. Exiting.', file=sys.stderr)
        # On the POSIX platforms this script supports, the status is
        # wait()-encoded: the exit code sits in the high byte. Passing it
        # to sys.exit() unchanged lets the OS truncate it to the low 8 bits,
        # so e.g. 256 (exit code 1) would be reported as success (0).
        exit_code = exit_status >> 8
        if not 0 < exit_code < 256:
            # Terminated by a signal, or otherwise not a plain exit code.
            exit_code = 1
        sys.exit(exit_code)
    return output_text
43 | ||
44 | ||
def usage():
    """Print the one-line usage summary for this script to stdout."""
    script_name = sys.argv[0]
    print("usage: {0} [-h | --help]".format(script_name))
47 | ||
48 | ||
49 | # | |
50 | # File check. Check source code files for UTF-8 and all except text files for not containing a BOM | |
51 | # file_name: name of a text file. | |
52 | # is_source: Flag, set to True if file is a source code file (.c, .cpp, .h, .java). | |
53 | # | |
def check_file(file_name, is_source):
    """Check one file for invalid UTF-8 and for an unexpected UTF-8 BOM.

    file_name: path of the file to check.
    is_source: true if the file is a source code file; only such files are
               required to consist of valid UTF-8.

    Problems are reported on stdout; nothing is returned.
    """
    with open(file_name, 'rb') as f:
        data = f.read()

    if is_source:
        try:
            data.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)

    # Test for the complete three-byte BOM. Looking only at the first byte
    # fails on empty files and misreports any file that merely starts with
    # a character whose UTF-8 encoding begins with 0xEF.
    if data.startswith(b"\xef\xbb\xbf"):
        bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)
72 | ||
def main(argv):
    """Check every file listed by `git ls-files` for UTF-8 and BOM problems.

    argv: command line arguments, without the program name.

    Exits with status 2 on a command line error, and with status 0 after
    printing the usage line when -h or --help is given.
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError as err:
        # Report getopt's own description of the problem; argv[0] is not
        # necessarily the argument that was rejected.
        print(err.msg)
        usage()
        sys.exit(2)
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        # A command line error must not look like a successful run.
        sys.exit(2)

    file_list = runCommand("git ls-files ").splitlines()

    # Files with these suffixes are source code and must be valid UTF-8.
    source_suffixes = (".c", ".cpp", ".h", ".java")

    for f in file_list:
        if os.path.isdir(f):
            print("Skipping dir " + f)
            continue
        if not os.path.isfile(f):
            print("Repository file not in working copy: " + f)
            continue

        check_file(f, f.endswith(source_suffixes))
104 | ||
# Script entry point: forward the command line arguments, without the
# program name, to main().
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)