]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/icu-file-utf8-check.py
ICU-66108.tar.gz
[apple/icu.git] / icuSources / tools / icu-file-utf8-check.py
1 #! /usr/bin/python -B
2
3 # Copyright (C) 2016 and later: Unicode, Inc. and others.
4 # License & terms of use: http://www.unicode.org/copyright.html
5
6 # Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
7 # All rights reserved.
8
9 #
10 # Script to check that ICU source files contain only valid UTF-8 encoded text,
11 # and that no files other than '.txt' files and Visual Studio files (.sln, .targets, .vcxproj) start with a Byte Order Mark (BOM).
12 #
13 # THIS SCRIPT DOES NOT WORK ON WINDOWS
14 # It only works correctly on platforms where the native line ending is a plain \n
15 #
16 # usage:
17 # icu-file-utf8-check.py [options]
18 #
19 # options:
20 # -h | --help Print a usage line and exit.
21 #
22 # The tool operates recursively on the directory from which it is run.
23 # Only files from the ICU github repository are checked.
24 # The script only reads files; neither the repository nor the working copy is modified.
25
26 from __future__ import print_function
27
28 import sys
29 import os
30 import os.path
31 import re
32 import getopt
33
34
def runCommand(cmd):
    """Run a shell command and return everything it wrote to stdout.

    cmd: command line string, handed to the shell through os.popen().

    Returns the command's standard output as one string.

    If the command reports a failure status, a message is printed to stderr
    and the script exits (SystemExit) with a non-zero exit code.
    """
    pipe = os.popen(cmd)
    try:
        output_text = pipe.read()
    finally:
        # close() yields None on success, otherwise the child's status.
        # Doing it in 'finally' also releases the pipe if read() raises.
        status = pipe.close()

    if status:
        print('"', cmd, '" failed. Exiting.', file=sys.stderr)
        # On POSIX the value from close() is a wait status with the exit code
        # in the high byte (exit code 1 arrives as 256).  Handing that raw
        # value to sys.exit() gets truncated to its low 8 bits, so a failure
        # would be reported as exit status 0.  Extract the real exit code and
        # fall back to 1 so the result is never mistaken for success.
        exit_code = (status >> 8) & 0xFF
        sys.exit(exit_code or 1)
    return output_text
43
44
def usage():
    """Print a one-line summary of the command-line syntax to stdout."""
    program = sys.argv[0]
    print("usage: %s [-h | --help]" % program)
47
48
49 #
50 # File check. Check source code files for UTF-8 and all except text files for not containing a BOM
51 # file_name: name of a text file.
52 # is_source: Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
53 #
def check_file(file_name, is_source):
    """Check one file for encoding problems and print any findings to stdout.

    file_name: path of the file to examine.
    is_source: truthy if the file is a source code file; only then are the
               contents required to be valid UTF-8.

    Prints an "Error:" line when a source file is not valid UTF-8, and a
    "Warning:" line when a file starts with a UTF-8 BOM, unless its name ends
    in .txt, .sln or .targets, or contains ".vcxproj".  Returns None.
    """
    with open(file_name, 'rb') as f:
        content = f.read()

    if is_source:
        try:
            content.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)

    # Test for the complete three-byte BOM (EF BB BF).  startswith() is safe
    # on an empty file and behaves the same on Python 2 and 3, whereas
    # indexing the first byte fails on empty input and matches any file whose
    # first character merely happens to start with 0xEF.
    if content.startswith(b"\xef\xbb\xbf"):
        bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)
72
def main(argv):
    """Parse the command line, then check every file listed by git.

    argv: command-line arguments without the program name.

    Recognizes -h / --help (prints usage and exits with status 0).  Any
    invalid option or stray positional argument prints a message plus the
    usage line and exits with status 2.  Otherwise each path printed by
    "git ls-files" that exists as a regular file is passed to check_file().
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError as err:
        # Report what getopt actually rejected; argv[0] is not necessarily
        # the offending option.
        print(err.msg)
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        # This is an error path, so do not exit with a success status.
        sys.exit(2)

    output = runCommand("git ls-files ")

    # File name endings that mark a file as source code.
    source_suffixes = (".c", ".cpp", ".h", ".java")

    for f in output.splitlines():
        if os.path.isdir(f):
            print("Skipping dir " + f)
            continue
        if not os.path.isfile(f):
            print("Repository file not in working copy: " + f)
            continue

        check_file(f, f.endswith(source_suffixes))
104
# Script entry point: pass along the command-line arguments, minus the
# program name, when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])