This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: localedef during tests suddenly needs a lot of memory
Mike FABIAN <mfabian@redhat.com> wrote:
> Florian Weimer <fweimer@redhat.com> wrote:
>
>> This started with:
>>
>> commit 7a79e321c6f85b204036c33d85f6b2aa794e7c76
>> Author: Thorsten Glaser <tg@mirbsd.de>
>> Date: Fri Jul 14 14:02:50 2017 +0200
>>
>> Refresh generated charmap data and ChangeLog
>>
>> [BZ #21750]
>> * charmaps/UTF-8: Refresh.
>>
>> Mike is looking at re-adding the range generation support to the Python
>> script, hopefully that should reduce the memory requirements again.
>>
>> Florian
>
> Attached is my patch to use ranges instead of single code points in the width
> data of charmaps/UTF-8 wherever possible.
To do this, I rewrote most of the code Thorsten Glaser added.
----------------------------------------------------------------------
Here is the diff of my patched utf8_gen.py against Thorsten Glaser’s
version:
$ git diff a3fe6a20bf81ef6a97a761dac9050517e7fd7a1f..4f737628ef23033b8d78e0acead37b2722419822 localedata/unicode-gen/utf8_gen.py
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index 1563aa11d2..7efae08461 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -229,27 +229,45 @@ def process_width(outfile, ulines, elines):
code_points = fields[0].split("..")
for key in range(int(code_points[0], 16),
int(code_points[1], 16)+1):
- width_dict[key] = unicode_utils.ucs_symbol(key) + '\t2'
+ width_dict[key] = 2
for line in ulines:
fields = line.split(";")
if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
- width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
- int(fields[0], 16)) + '\t0'
+ width_dict[int(fields[0], 16)] = 0
# handle special cases for compatibility
- for key in list(range(0x1160, 0x1200)) + list(range(0x3248, 0x3250)) + \
- list(range(0x4DC0, 0x4E00)) + list((0x00AD,)):
+ for key in list((0x00AD,)):
if key in width_dict:
del width_dict[key]
- width_dict[0x1160] = '{:s}...{:s}\t0'.format(
- unicode_utils.ucs_symbol(0x1160), unicode_utils.ucs_symbol(0x11FF))
- width_dict[0x3248] = '{:s}...{:s}\t2'.format(
- unicode_utils.ucs_symbol(0x3248), unicode_utils.ucs_symbol(0x324F))
- width_dict[0x4DC0] = '{:s}...{:s}\t2'.format(
- unicode_utils.ucs_symbol(0x4DC0), unicode_utils.ucs_symbol(0x4DFF))
+ for key in list(range(0x1160, 0x1200)):
+ width_dict[key] = 0
+ for key in list(range(0x3248, 0x3250)) + list(range(0x4DC0, 0x4E00)):
+ width_dict[key] = 2
+ same_width_lists = []
+ current_width_list = []
for key in sorted(width_dict):
- outfile.write(width_dict[key]+'\n')
+ if not current_width_list:
+ current_width_list = [key]
+ elif (key == current_width_list[-1] + 1
+ and width_dict[key] == width_dict[current_width_list[0]]):
+ current_width_list.append(key)
+ else:
+ same_width_lists.append(current_width_list)
+ current_width_list = [key]
+ if current_width_list:
+ same_width_lists.append(current_width_list)
+
+ for same_width_list in same_width_lists:
+ if len(same_width_list) == 1:
+ outfile.write('{:s}\t{:d}\n'.format(
+ unicode_utils.ucs_symbol(same_width_list[0]),
+ width_dict[same_width_list[0]]))
+ else:
+ outfile.write('{:s}...{:s}\t{:d}\n'.format(
+ unicode_utils.ucs_symbol(same_width_list[0]),
+ unicode_utils.ucs_symbol(same_width_list[-1]),
+ width_dict[same_width_list[0]]))
if __name__ == "__main__":
if len(sys.argv) < 3:
----------------------------------------------------------------------
Here is the diff of my patched utf8_gen.py against a version before
Thorsten Glaser’s patches:
$ git diff bfff8b1becd7d01c074177df7196ab327cd8c844..4f737628ef23033b8d78e0acead37b2722419822 localedata/unicode-gen/utf8_gen.py
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index ab03e750a6..7efae08461 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -221,31 +221,53 @@ def process_width(outfile, ulines, elines):
'''
width_dict = {}
- for line in ulines:
- fields = line.split(";")
- if fields[4] == "NSM" or fields[2] == "Cf":
- width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
- int(fields[0], 16)) + '\t0'
-
for line in elines:
- # If an entry in EastAsianWidth.txt is found, it overrides entries in
- # UnicodeData.txt:
fields = line.split(";")
if not '..' in fields[0]:
- width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
- int(fields[0], 16)) + '\t2'
+ code_points = (fields[0], fields[0])
else:
code_points = fields[0].split("..")
- for key in range(int(code_points[0], 16),
- int(code_points[1], 16)+1):
- if key in width_dict:
- del width_dict[key]
- width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
- unicode_utils.ucs_symbol(int(code_points[0], 16)),
- unicode_utils.ucs_symbol(int(code_points[1], 16)))
+ for key in range(int(code_points[0], 16),
+ int(code_points[1], 16)+1):
+ width_dict[key] = 2
+ for line in ulines:
+ fields = line.split(";")
+ if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
+ width_dict[int(fields[0], 16)] = 0
+ # handle special cases for compatibility
+ for key in list((0x00AD,)):
+ if key in width_dict:
+ del width_dict[key]
+ for key in list(range(0x1160, 0x1200)):
+ width_dict[key] = 0
+ for key in list(range(0x3248, 0x3250)) + list(range(0x4DC0, 0x4E00)):
+ width_dict[key] = 2
+
+ same_width_lists = []
+ current_width_list = []
for key in sorted(width_dict):
- outfile.write(width_dict[key]+'\n')
+ if not current_width_list:
+ current_width_list = [key]
+ elif (key == current_width_list[-1] + 1
+ and width_dict[key] == width_dict[current_width_list[0]]):
+ current_width_list.append(key)
+ else:
+ same_width_lists.append(current_width_list)
+ current_width_list = [key]
+ if current_width_list:
+ same_width_lists.append(current_width_list)
+
+ for same_width_list in same_width_lists:
+ if len(same_width_list) == 1:
+ outfile.write('{:s}\t{:d}\n'.format(
+ unicode_utils.ucs_symbol(same_width_list[0]),
+ width_dict[same_width_list[0]]))
+ else:
+ outfile.write('{:s}...{:s}\t{:d}\n'.format(
+ unicode_utils.ucs_symbol(same_width_list[0]),
+ unicode_utils.ucs_symbol(same_width_list[-1]),
+ width_dict[same_width_list[0]]))
if __name__ == "__main__":
if len(sys.argv) < 3:
--
Mike FABIAN <mfabian@redhat.com>