This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
On 09 Feb 2016 09:25, Marko Myllynen wrote: > On 2016-02-09 09:20, Mike Frysinger wrote: > > i need to respin this patch due to some of the comments falling out of > > date. should i send another patch to just delete these things ? e.g. > > files inconsistently do things like: > > % wa > > lang_ab "<U0077><U0061>" > > > > i think we should just scrub these and rely on a simple filter script > > to view the content. > > This series is a great initiative, thanks for doing this! I wonder how > have you generated the patches, would it be possible to script any of > this so that in case CLDR changes in the future we could automatically > update glibc locales accordingly then? yes, i've written a python file that downloads the CLDR db, loads the locale files, and then merges them and checks for differences. i'm in the process of trying to extract as much as possible from the CLDR db. you can find my in-progress work attached, but i'm not posting it for review yet ;). i just run it like: $ ./cldr.py locales/* -mike
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.

"""Helper tool for importing current CLDR data.

See http://cldr.unicode.org/ for more details."""

import argparse
import datetime
import errno
import itertools
import logging
import os
import re
import subprocess
import sys
import time
from xml.etree import ElementTree


# The current release version that we use.
CLDR_VERSION = '28'

# Where to find the CLDR data.
CLDR_URI = 'http://unicode.org/Public/cldr/%(version)s/core.zip'

# Where to store CLDR data files we fetch.
DEFAULT_WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'cldr-%(version)s')

# Whether we should clean up newlines/comments.
REWRITE_STYLE = False


def u_encode(text):
    """Convert unicode |text| to <U####> format."""
    return ''.join('<U%04X>' % ord(x) for x in text)


def u_decode(text):
    """Convert <U####> format in |text| back to unicode characters."""
    def unirep(match):
        return chr(int(match.group(1), 16))
    conv = re.compile(r'<U([0-9A-Fa-f]+)>')
    return conv.sub(unirep, text)


def get_parser():
    """Return an argparse parser for the script's command line."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--working-dir', default=DEFAULT_WORKING_DIR,
                        help='Where to download CLDR files '
                             '(default: %(default)s)')
    parser.add_argument('-v', '--version', default=CLDR_VERSION,
                        help='Version of CLDR to use (default: %(default)s)')
    parser.add_argument('locales', nargs='*',
                        help='Locales to generate')
    return parser


def logging_init(debug=False):
    """Set up the logging module.

    Args:
      debug: Emit DEBUG-level records when True, else INFO-level.
    """
    fmt = '%(asctime)s: %(levelname)-7s: '
    fmt += '%(message)s'

    # 'Sat, 05 Oct 2013 18:58:50 -0400 (EST)'
    tzname = time.strftime('%Z', time.localtime())
    datefmt = '%a, %d %b %Y %H:%M:%S ' + tzname

    level = logging.DEBUG if debug else logging.INFO

    handler = logging.StreamHandler(stream=sys.stdout)
    formatter = logging.Formatter(fmt, datefmt)
    handler.setFormatter(formatter)

    logger = logging.getLogger()
    logger.addHandler(handler)
    logger.setLevel(level)


class LocaleError(Exception):
    """Error w/Locale objects"""


class LocaleCategory(object):
    """Content for a single locale category (e.g. LC_TIME)."""

    def __init__(self, name='', content=(), header=()):
        self.name = name.lower()
        # Normalize to lists: __str__ concatenates these with list literals,
        # which raises TypeError with the (tuple) defaults otherwise.
        self.content = list(content)
        self.header = list(header)

    def __str__(self):
        padding = '\n' if REWRITE_STYLE else ''
        ret = ''
        if self.header:
            ret += padding + '\n'.join(self.header) + '\n'
        NAME = self.name.upper()
        ret += padding + '\n'.join(
            [NAME] + self.content + ['END %s' % NAME]) + '\n'
        return ret


class Locale(object):
    """Content for a locale file itself."""

    def __init__(self, name=None, path=None):
        self.name = name
        self.header = []
        self.lc_identification = []
        self.lc_ctype = []
        self.lc_collate = []
        self.lc_time = []
        self.lc_numeric = []
        self.lc_monetary = []
        self.lc_messages = []
        self.lc_paper = []
        self.lc_name = []
        self.lc_address = []
        self.lc_telephone = []
        self.lc_measurement = []
        # Category names (lowercase) in the order they appear in the file.
        self.categories = []
        self.cldr = None
        if path is not None:
            self.read(path)

    def readfp(self, fp):
        """Load the locale content from |fp|.

        Raises:
          LocaleError: If the comment_char/escape_char header is malformed,
            or a category body is malformed.
        """
        def _trim_extra_lines(lines, leading=True, trailing=True,
                              consecutive=True, comments=False):
            # Only active when we're rewriting the style of the file.
            if not REWRITE_STYLE:
                return lines

            # Clear leading blank lines.
            if leading:
                while lines and not lines[0]:
                    lines.pop(0)

            # Clear trailing blank lines.
            if trailing:
                while lines and not lines[-1]:
                    lines.pop(-1)

            # Clear consecutive blank lines.
            if consecutive:
                i = 0
                while i < len(lines) - 1:
                    if not lines[i] and not lines[i + 1]:
                        lines.pop(i)
                    else:
                        i += 1

            # Trim blank comment lines that start/end a section.
            if comments:
                i = 0
                while i < len(lines):
                    if (lines[i] == '%' and
                            (i == 0 or not lines[i - 1] or
                             lines[i - 1][0] != '%')):
                        lines.pop(i)
                    elif (lines[i] == '%' and
                          (i == len(lines) - 1 or not lines[i + 1] or
                           lines[i + 1][0] != '%')):
                        lines.pop(i)
                    else:
                        i += 1

            return lines

        lines = [x.rstrip() for x in fp.readlines()]
        _trim_extra_lines(lines)

        # Process the leading few lines.  The comment_char & escape_char
        # declarations may appear in either order; normalize them.
        comment_line = lines.pop(0)
        escape_line = lines.pop(0)
        self.header = [comment_line, escape_line]
        if escape_line.startswith('comment_char'):
            escape_line, comment_line = comment_line, escape_line

        line = comment_line
        if line.startswith('comment_char'):
            if line.split()[1] != '%':
                raise LocaleError('Bad comment_char: %s' % line)
        else:
            raise LocaleError(
                'Second line should be comment_char, not %s' % line)

        line = escape_line
        if line.startswith('escape_char'):
            if line.split()[1] != '/':
                raise LocaleError('Bad escape_char: %s' % line)
        else:
            raise LocaleError(
                'First line should be escape_char, not %s' % line)

        # Now walk each locale category.
        while lines:
            # Extract any leading comments.
            header = []
            while lines:
                line = lines[0]
                if line.startswith('LC_'):
                    break
                elif not line or line[0] == '%':
                    header.append(line)
                    lines.pop(0)
                    continue
                else:
                    break
            _trim_extra_lines(header)

            if not lines:
                if header:
                    print('Throwing away trailing lines: %r' % header,
                          file=sys.stderr)
                return

            line = lines.pop(0)
            if line[0:3] != 'LC_':
                raise LocaleError('Bad line state: %s' % line)
            cat = line.lower()

            # Gather the category body until its END marker.
            cat_lines = []
            while lines:
                line = lines.pop(0)
                if line == 'END %s' % cat.upper():
                    break
                cat_lines.append(line)
            _trim_extra_lines(cat_lines)

            setattr(self, cat, LocaleCategory(name=cat, content=cat_lines,
                                              header=header))
            self.categories.append(cat)

    def read(self, path):
        """Load the locale file from |path|"""
        # Use a context manager so the handle doesn't leak.
        with open(path) as fp:
            self.readfp(fp)

    def writefp(self, fp):
        """Write the locale content to |fp|"""
        if REWRITE_STYLE:
            header = ['comment_char %', 'escape_char /']
        else:
            header = self.header
        fp.write('\n'.join(header) + '\n')
        for category in self.categories:
            lc = getattr(self, category)
            fp.write(str(lc))

    def write(self, path):
        """Write the locale content to |path|"""
        with open(path, 'w') as fp:
            self.writefp(fp)

    def update_cldr(self, cldr_data):
        """Merge CLDR updates into this locale.

        Args:
          cldr_data: Dict from download_cldr() with 'uri', 'date', 'dir',
            and 'version' keys.
        """
        locale = self.name
        common_dir = cldr_data['dir']

        # Silently skip locales that CLDR does not know about.
        locale_file = os.path.join(common_dir, 'main', '%s.xml' % locale)
        try:
            locale_tree = ElementTree.parse(locale_file)
        except OSError as e:
            if e.errno == errno.ENOENT:
                return
            raise
        locale_root = locale_tree.getroot()
        locale_lang_root = locale_root.find('identity/language')
        lang = locale_lang_root.get('type')
        locale_territory_root = locale_root.find('identity/territory')
        territory = locale_territory_root.get('type')

        chars_file = os.path.join(common_dir, 'supplemental',
                                  'characters.xml')
        chars_tree = ElementTree.parse(chars_file)
        chars_root = chars_tree.getroot()

        english_names_file = os.path.join(common_dir, 'main', 'en.xml')
        en_tree = ElementTree.parse(english_names_file)
        en_root = en_tree.getroot()
        en_names = en_root.find('localeDisplayNames')
        # First see if the locale has a name before we fall back to the lang.
        en_langs_root = en_names.find('languages')
        en_lang_name_root = en_langs_root.find(
            'language[@type="%s"]' % locale)
        if en_lang_name_root is None:
            en_lang_name_root = en_langs_root.find(
                'language[@type="%s"]' % lang)
        en_lang_name = en_lang_name_root.findtext('.')
        en_territory_name = en_names.find(
            'territories/territory[@type="%s"]' % territory).findtext('.')

        tele_file = os.path.join(common_dir, 'supplemental',
                                 'telephoneCodeData.xml')
        tele_tree = ElementTree.parse(tele_file)
        tele_root = tele_tree.getroot()
        int_prefix = tele_root.find(
            'telephoneCodeData/codesByTerritory[@territory="%s"]'
            '/telephoneCountryCode' % territory).get('code')

        supp_file = os.path.join(common_dir, 'supplemental',
                                 'supplementalData.xml')
        supp_tree = ElementTree.parse(supp_file)
        supp_root = supp_tree.getroot()
        territory_codes = supp_root.find(
            'codeMappings/territoryCodes[@type="%s"]' % territory)
        country_num = territory_codes.get('numeric')
        #country_term = territory_codes.get('alpha3').lower()

        # The xpath support in python is not complete, so we need to search
        # for the currency w/missing @to attribute ourselves.
        currencies = supp_root.find(
            'currencyData/region[@iso3166="%s"]' % territory)
        # NOTE: list(element) replaces Element.getchildren(), which was
        # removed in Python 3.9.
        for currency in list(currencies):
            if 'to' not in currency.keys():
                break
        else:
            raise ValueError('Could not find a currency for %s' % territory)
        int_curr_symbol = currency.get('iso4217')

        lang_file = os.path.join(common_dir, 'main', '%s.xml' % lang)
        lang_tree = ElementTree.parse(lang_file)
        lang_root = lang_tree.getroot()
        langs_names = lang_root.find('localeDisplayNames')
        langs_root = langs_names.find('languages')
        langs_name_root = langs_root.find('language[@type="%s"]' % lang)
        lang_name = langs_name_root.findtext('.')
        country_name = langs_names.find(
            'territories/territory[@type="%s"]' % territory).findtext('.')

        numbers_root = lang_root.find('numbers')
        symbol_ele = numbers_root.find(
            'currencies/currency[@type="%s"]/symbol' % int_curr_symbol)
        if symbol_ele is None:
            # Fall back to the generic substitution table.
            for symbol_ele in list(
                    chars_root.find('characters/character-fallback')):
                if symbol_ele.findtext('substitute') == int_curr_symbol:
                    currency_symbol = symbol_ele.get('value')
                    break
            else:
                currency_symbol = ''
        else:
            currency_symbol = symbol_ele.findtext('.')

        # If there's a default labeled, use it.  Otherwise just go with the
        # first one found.  It should be the only one.
        num_sys_ele = numbers_root.find('defaultNumberingSystem')
        if num_sys_ele is None:
            num_symbols_root = numbers_root.find('symbols')
        else:
            num_symbols_root = numbers_root.find(
                'symbols[@numberSystem="%s"]' % num_sys_ele.findtext('.'))
        try:
            decimal_point = num_symbols_root.find('decimal').findtext('.')
        except AttributeError:
            decimal_point = '.'
        try:
            thousands_sep = num_symbols_root.find('group').findtext('.')
        except AttributeError:
            thousands_sep = ','

        dates_root = lang_root.find('dates')
        calendars_root = dates_root.find('calendars')
        calendar_root = calendars_root.find('calendar[@type="gregorian"]')
        ABDAYS = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
        # Prefer the stand-alone forms, then the format forms.
        abdays_root = wddays_root = None
        for key in ('stand-alone', 'format'):
            days_root = calendar_root.find(
                'days/dayContext[@type="%s"]' % key)
            if days_root is None:
                continue
            if abdays_root is None:
                abdays_root = days_root.find('dayWidth[@type="abbreviated"]')
            if wddays_root is None:
                wddays_root = days_root.find('dayWidth[@type="wide"]')
        if abdays_root is not None:
            abdays = [abdays_root.find('day[@type="%s"]' % x).findtext('.')
                      for x in ABDAYS]
        else:
            abdays = None
        wddays = [wddays_root.find('day[@type="%s"]' % x).findtext('.')
                  for x in ABDAYS]

        abmons_root = wdmons_root = None
        for key in ('stand-alone', 'format'):
            mons_root = calendar_root.find(
                'months/monthContext[@type="%s"]' % key)
            if mons_root is None:
                continue
            if abmons_root is None:
                abmons_root = mons_root.find(
                    'monthWidth[@type="abbreviated"]')
            if wdmons_root is None:
                wdmons_root = mons_root.find('monthWidth[@type="wide"]')
        if abmons_root is not None:
            abmons = [abmons_root.find('month[@type="%s"]' % x).findtext('.')
                      for x in range(1, 13)]
        else:
            abmons = None
        wdmons = [wdmons_root.find('month[@type="%s"]' % x).findtext('.')
                  for x in range(1, 13)]

        # Start updating the actual data.
        cldr_values = {
            'generator': os.path.basename(__file__),
            'english_lang_name': en_lang_name,
            'english_territory_name': en_territory_name,
            'source_name': 'Unicode Common Locale Data Repository (CLDR)',
            'source_version': cldr_data['version'],
            'source_uri': cldr_data['uri'].replace('/', '//'),
            'source_date': cldr_data['date'].strftime('%Y-%m-%d'),
            'lang': lang,
            'territory': territory,
            'locale': locale,
            'int_curr_symbol': u_encode(int_curr_symbol + ' '),
            'currency_symbol': u_encode(currency_symbol),
            'decimal_point': u_encode(decimal_point),
            'thousands_sep': u_encode(thousands_sep),
            'country_name': u_encode(country_name),
            'country_num': int(country_num),
            'lang_name': u_encode(lang_name),
            'lang_ab': u_encode(lang),
            #'lang_term': u_encode('deu'),
            #'lang_lib': u_encode('ger'),
            'int_prefix': u_encode(int_prefix),
        }

        # Map each category to the keywords we rewrite in it.  Values are
        # %-templates expanded against cldr_values above.
        all_values = {}
        d = all_values['lc_identification'] = {
            'title': '%(english_lang_name)s language locale for '
                     '%(english_territory_name)s',
            'source': '%(source_name)s',
            #'address': '%(source_uri)s',
            #'contact': 'http:////cldr.unicode.org//index//process',
            #'email': 'bug-glibc-locales@gnu.org',
            #'tel': '',
            #'fax': '',
            'language': '%(english_lang_name)s',
            'territory': '%(english_territory_name)s',
            'revision': '%(source_version)s',
            'date': '%(source_date)s',
        }
        d = all_values['lc_ctype'] = {
        }
        d = all_values['lc_collate'] = {
        }
        d = all_values['lc_time'] = {
        }
        # The [1:-2] slice drops the leading quote and the trailing quote &
        # semicolon; the rewrite loop below re-adds the outer quotes.
        if abdays:
            d['abday'] = ''.join('"%s";' % u_encode(x) for x in abdays)[1:-2]
        if wddays:
            d['day'] = ''.join('"%s";' % u_encode(x) for x in wddays)[1:-2]
        if abmons:
            d['abmon'] = ''.join('"%s";' % u_encode(x) for x in abmons)[1:-2]
        if wdmons:
            d['mon'] = ''.join('"%s";' % u_encode(x) for x in wdmons)[1:-2]
        d = all_values['lc_numeric'] = {
            #'decimal_point': '%(decimal_point)s',
            #'thousands_sep': '%(thousands_sep)s',
        }
        d = all_values['lc_monetary'] = {
            #'int_curr_symbol': '%(int_curr_symbol)s',
            #'currency_symbol': '%(currency_symbol)s',
        }
        d = all_values['lc_messages'] = {
        }
        d = all_values['lc_paper'] = {
        }
        d = all_values['lc_name'] = {
        }
        d = all_values['lc_address'] = {
            'country_name': '%(country_name)s',
            #'country_num': '%(country_num)i',
            'lang_name': '%(lang_name)s',
            'lang_ab': '%(lang_ab)s',
            #'lang_term': '%(lang_term)s',
            #'lang_lib': '%(lang_lib)s',
        }
        d = all_values['lc_telephone'] = {
            'int_prefix': '%(int_prefix)s',
        }
        d = all_values['lc_measurement'] = {
        }

        for category in self.categories:
            lc = getattr(self, category)
            values = all_values[category]

            start_of_line = None
            full_line = ''
            i = 0
            while i < len(lc.content):
                line = lc.content[i]
                if not line:
                    i += 1
                    continue

                if line.endswith('/'):
                    # Continuation line: accumulate until the logical end.
                    if not full_line:
                        start_of_line = i
                    full_line += line[:-1].lstrip()
                    i += 1
                    continue
                elif full_line:
                    line = full_line + line.lstrip()
                    full_line = ''

                key = line.split()[0]
                if key in values:
                    m = re.match(r'([^"]*)"(.*)"$', line)
                    if m:
                        new_value = values[key] % cldr_values
                        # We should standardize case at some point.
                        if new_value.lower() != m.group(2).lower():
                            logging.info('%s: %s: changing {%s} to {%s}',
                                         self.name, key,
                                         u_decode(m.group(2)),
                                         u_decode(new_value))
                            leading_line = m.group(1)
                            # This is tricky as we have to delete most of the
                            # multiline, then update the one remaining.
                            if start_of_line is not None:
                                for _ in range(start_of_line, i):
                                    lc.content.pop(start_of_line)
                                i = start_of_line
                            if '";"' in new_value:
                                leading_line = leading_line.rstrip() + '\t'
                                num_tabs = (len(leading_line) // 8) + 1
                                new_value = new_value.replace(
                                    '";"', '";/\n' + ('\t' * num_tabs) + '"')
                            lc.content[i] = '%s"%s"' % (leading_line,
                                                        new_value)
                # Clear stale multiline state so a later single-line match
                # can't delete unrelated lines.
                start_of_line = None
                i += 1


def download_cldr(opts):
    """Download the current cldr database.

    Args:
      opts: Parsed command line options.

    Returns:
      Dict with 'uri', 'date', 'dir', and 'version' keys describing the
      unpacked CLDR data.
    """
    # Set up the working dir.
    fields = {'version': opts.version}
    working_dir = opts.working_dir % fields
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)

    # Download the CLDR data (only once; reuse a previous download).
    cldr_archive = os.path.join(working_dir, 'core.zip')
    cldr_uri = CLDR_URI % fields
    if not os.path.exists(cldr_archive):
        subprocess.check_call(['wget', '-O', cldr_archive, cldr_uri])
    cldr_date = datetime.datetime.fromtimestamp(
        os.path.getmtime(cldr_archive))

    # Unpack the CLDR data.
    common_dir = os.path.join(working_dir, 'common')
    if not os.path.exists(common_dir):
        subprocess.check_call(['unzip', '-u', 'core.zip'], cwd=working_dir)

    return {
        'uri': cldr_uri,
        'date': cldr_date,
        'dir': common_dir,
        'version': opts.version,
    }


def main(argv):
    parser = get_parser()
    opts = parser.parse_args(argv)
    # BUGFIX: the old code passed |opts| as the |debug| flag, which forced
    # DEBUG-level logging unconditionally.
    logging_init()

    cldr_data = download_cldr(opts)
    for locale in opts.locales:
        name = os.path.basename(locale)
        # Skip our own temp files and non-locale inputs.
        if name.endswith('.new'):
            continue
        if name.startswith('iso14651_') or name == 'POSIX':
            continue
        logging.info('Updating %s', locale)
        try:
            loc = Locale(name=name, path=locale)
            try:
                loc.update_cldr(cldr_data)
            except Exception:
                # Best effort: log & keep going so one bad locale doesn't
                # abort the whole run; the file is still rewritten below.
                logging.error('%s: updating failed', locale, exc_info=True)
            loc.write(locale + '.new')
            os.rename(locale + '.new', locale)
        except UnicodeDecodeError:
            logging.error('%s: bad encodings', locale, exc_info=True)
            subprocess.check_call(['file', locale])
        except (IndexError, LocaleError):
            logging.error('%s: loading failed', locale, exc_info=True)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
Attachment:
signature.asc
Description: Digital signature
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |