This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 3/5] localedata: CLDRv28: update LC_ADDRESS.country_name translations


On 09 Feb 2016 09:25, Marko Myllynen wrote:
> On 2016-02-09 09:20, Mike Frysinger wrote:
> > i need to respin this patch due to some of the comments falling out of
> > date.  should i send another patch to just delete these things ?  e.g.
> > files inconsistently do things like:
> > % wa
> > lang_ab      "<U0077><U0061>"
> > 
> > i think we should just scrub these and rely on a simple filter script
> > to view the content.
> 
> This series is a great initiative, thanks for doing this! I wonder how
> you generated the patches; would it be possible to script any of
> this so that, in case CLDR changes in the future, we could automatically
> update the glibc locales accordingly?

yes, i've written a python file that downloads the CLDR db, loads the
locale files, and then merges them and checks for differences.  i'm in
the process of trying to extract as much as possible from the CLDR db.
you can find my in-progress work attached, but i'm not posting it for
review yet ;).

i just run it like:
$ ./cldr.py locales/*
-mike
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Written by Mike Frysinger <vapier@gentoo.org> for much great glory.

"""Helper tool for importing current CLDR data.

See http://cldr.unicode.org/ for more details."""

from __future__ import print_function

import argparse
import datetime
import errno
import itertools
import logging
import os
import re
import subprocess
import sys
import time
from xml.etree import ElementTree


# The current release version that we use.  This is the default for the
# -v/--version command line option.
CLDR_VERSION = '28'

# Where to find the CLDR data.  The %(version)s placeholder is filled in by
# download_cldr() with the selected CLDR version.
CLDR_URI = 'http://unicode.org/Public/cldr/%(version)s/core.zip'

# Where to store CLDR data files we fetch.  This is the default for the
# --working-dir option: a "cldr-<version>" directory next to this script.
# The %(version)s placeholder is expanded by download_cldr().
DEFAULT_WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'cldr-%(version)s')

# Whether we should clean up newlines/comments.  When true, Locale.readfp()
# trims redundant blank lines, Locale.writefp() emits a canonical
# comment_char/escape_char header, and LocaleCategory adds a blank line before
# each section it prints.
REWRITE_STYLE = False


def u_encode(text):
    """Return |text| with every character spelled out as <U####>.

    Each code point is written as upper-case hex, zero padded to at least
    four digits (e.g. "wa" becomes "<U0077><U0061>").
    """
    encoded = []
    for char in text:
        encoded.append('<U{:04X}>'.format(ord(char)))
    return ''.join(encoded)


def u_decode(text):
    """Expand every <U####> escape in |text| into the character it names.

    Anything in |text| that is not a <U####> escape is passed through as is.
    """
    return re.sub(r'<U([0-9A-Fa-f]+)>',
                  lambda escape: chr(int(escape.group(1), 16)),
                  text)


def get_parser():
    """Build the command line parser for this tool.

    Returns:
      An argparse.ArgumentParser that accepts --working-dir, -v/--version and
      any number of positional locale paths.
    """
    # (flags, keyword settings) for each argument, in registration order.
    arguments = (
        (('--working-dir',),
         {'default': DEFAULT_WORKING_DIR,
          'help': 'Where to download CLDR files (default: %(default)s)'}),
        (('-v', '--version'),
         {'default': CLDR_VERSION,
          'help': 'Version of CLDR to use (default: %(default)s)'}),
        (('locales',),
         {'nargs': '*', 'help': 'Locales to generate'}),
    )

    cli = argparse.ArgumentParser(description=__doc__)
    for flags, settings in arguments:
        cli.add_argument(*flags, **settings)
    return cli


def logging_init(debug=False):
    """Attach a timestamped stdout handler to the root logger.

    Args:
      debug: When true the root logger runs at DEBUG level, otherwise INFO.
    """
    # Timestamps come out like 'Sat, 05 Oct 2013 18:58:50 EST'.
    local_zone = time.strftime('%Z', time.localtime())
    layout = logging.Formatter('%(asctime)s: %(levelname)-7s: %(message)s',
                               '%a, %d %b %Y %H:%M:%S ' + local_zone)

    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    stdout_handler.setFormatter(layout)

    root = logging.getLogger()
    root.addHandler(stdout_handler)
    if debug:
        root.setLevel(logging.DEBUG)
    else:
        root.setLevel(logging.INFO)


class LocaleError(Exception):
    """Raised when a locale file does not have the layout we expect."""


class LocaleCategory(object):
    """Content for a single locale category.

    Attributes:
      name: The category name, stored lower-cased (e.g. "lc_address").
      content: List of the lines found between the "LC_xxx" line and the
          matching "END LC_xxx" line (neither marker is included).
      header: The blank/comment lines that come right before the category.
    """

    def __init__(self, name='', content=(), header=()):
        """Initialize the category.

        Args:
          name: The category name; it is lower-cased before being stored.
          content: The lines inside the category.  A list is stored as is (so
              in-place edits by the caller stay visible); any other iterable
              is copied into a new list.
          header: The lines that precede the category.
        """
        self.name = name.lower()
        # __str__ concatenates this with other lists, which raises TypeError
        # for tuples (including the default), so always hold a real list.
        self.content = content if isinstance(content, list) else list(content)
        self.header = header

    def __str__(self):
        """Return the category as it should appear in the locale file."""
        # When rewriting the style, every section is set off by a blank line.
        padding = '\n' if REWRITE_STYLE else ''
        marker = self.name.upper()
        ret = ''
        if self.header:
            ret += padding + '\n'.join(self.header) + '\n'
        body = [marker] + list(self.content) + ['END %s' % marker]
        ret += padding + '\n'.join(body) + '\n'
        return ret


class Locale(object):
    """Content for a locale file itself.

    A locale file is handled as two header lines (comment_char/escape_char)
    followed by a series of LC_xxx sections.  Each parsed section is stored as
    a LocaleCategory in the attribute named after the lower-cased section
    (e.g. |lc_address|), and |categories| remembers the order they were seen
    in so the file can be written back out the same way.
    """

    def __init__(self, name=None, path=None):
        """Initialize the locale.

        Args:
          name: The locale name; update_cldr() uses it to locate the matching
              CLDR "main/<name>.xml" file.
          path: If not None, the locale file to load right away.
        """
        self.name = name
        self.header = []
        self.lc_identification = []
        self.lc_ctype = []
        self.lc_collate = []
        self.lc_time = []
        self.lc_numeric = []
        self.lc_monetary = []
        self.lc_messages = []
        self.lc_paper = []
        self.lc_name = []
        self.lc_address = []
        self.lc_telephone = []
        self.lc_measurement = []
        # Names of the categories found by readfp(), in file order.
        self.categories = []
        self.cldr = None

        if path is not None:
            self.read(path)

    def readfp(self, fp):
        """Load the locale content from |fp|

        Raises:
          LocaleError: The header lines or the section layout are not what we
              expect.
          IndexError: The file has fewer than two lines.
        """
        def _trim_extra_lines(lines, leading=True, trailing=True,
                              consecutive=True, comments=False):
            """Strip redundant blank/comment lines from |lines| in place.

            This is a no-op unless REWRITE_STYLE is enabled.
            """
            if not REWRITE_STYLE:
                return lines

            # Clear leading blank lines.
            if leading:
                while lines and not lines[0]:
                    lines.pop(0)

            # Clear trailing blank lines.
            if trailing:
                while lines and not lines[-1]:
                    lines.pop(-1)

            # Clear consecutive blank lines.
            if consecutive:
                i = 0
                while i < len(lines) - 1:
                    if not lines[i] and not lines[i + 1]:
                        lines.pop(i)
                    else:
                        i += 1

            # Trim blank comment lines that start/end a section.
            if comments:
                i = 0
                while i < len(lines):
                    if (lines[i] == '%' and
                        (i == 0 or not lines[i - 1] or lines[i - 1][0] != '%')):
                        lines.pop(i)
                    elif (lines[i] == '%' and
                          (i == len(lines) - 1 or not lines[i + 1] or
                           lines[i + 1][0] != '%')):
                        lines.pop(i)
                    else:
                        i += 1

            return lines

        lines = [x.rstrip() for x in fp.readlines()]
        _trim_extra_lines(lines)

        # Process the leading few lines.  The two header lines are accepted in
        # either order, and kept verbatim so writefp() can reproduce them.
        comment_line = lines.pop(0)
        escape_line = lines.pop(0)
        self.header = [comment_line, escape_line]
        if escape_line.startswith('comment_char'):
            escape_line, comment_line = comment_line, escape_line

        line = comment_line
        if line.startswith('comment_char'):
            if line.split()[1] != '%':
                raise LocaleError('Bad comment_char: %s' % line)
        else:
            raise LocaleError('Second line should be comment_char, not %s' % line)

        line = escape_line
        if line.startswith('escape_char'):
            if line.split()[1] != '/':
                raise LocaleError('Bad escape_char: %s' % line)
        else:
            raise LocaleError('First line should be escape_char, not %s' % line)

        # Now walk each locale category.
        while lines:
            # Extract any leading comments.
            header = []
            while lines:
                line = lines[0]
                if line.startswith('LC_'):
                    break
                elif not line or line[0] == '%':
                    header.append(line)
                    lines.pop(0)
                    continue
                else:
                    # Neither a comment nor a section start; the check below
                    # will reject it.
                    break
            _trim_extra_lines(header)

            if not lines:
                # Blank/comment lines after the last section belong to no
                # category, so there is nowhere to keep them.
                if header:
                    print('Throwing away trailing lines: %r' % header,
                          file=sys.stderr)
                return

            line = lines.pop(0)
            if line[0:3] != 'LC_':
                raise LocaleError('Bad line state: %s' % line)

            # Gather everything up to the matching END line.
            cat = line.lower()
            cat_lines = []
            while lines:
                line = lines.pop(0)
                if line == 'END %s' % cat.upper():
                    break
                cat_lines.append(line)
            _trim_extra_lines(cat_lines)
            setattr(self, cat, LocaleCategory(name=cat, content=cat_lines, header=header))
            self.categories.append(cat)

    def read(self, path):
        """Load the locale file from |path|"""
        # Close the file as soon as it has been parsed instead of leaking it.
        with open(path) as fp:
            self.readfp(fp)

    def writefp(self, fp):
        """Write the locale content to |fp|"""
        if REWRITE_STYLE:
            header = ['comment_char %', 'escape_char /']
        else:
            header = self.header
        fp.write('\n'.join(header) + '\n')

        for category in self.categories:
            lc = getattr(self, category)
            fp.write(str(lc))

    def write(self, path):
        """Write the locale content to |path|"""
        # The file must be flushed & closed before callers rename it.
        with open(path, 'w') as fp:
            self.writefp(fp)

    def update_cldr(self, cldr_data):
        """Merge CLDR updates in to this locale.

        The CLDR xml files are consulted for the values we know how to
        generate, and any quoted value in the loaded categories that differs
        (ignoring case) is rewritten in place.

        Args:
          cldr_data: Dict describing the unpacked CLDR data, as returned by
              download_cldr(); the 'dir', 'version', 'uri' and 'date' keys
              are read.

        Returns:
          None.  If CLDR has no file for this locale, nothing is changed.

        Raises:
          ValueError: No current currency could be found for the territory.
          AttributeError: An xml element we rely on is missing.
        """
        locale = self.name
        common_dir = cldr_data['dir']

        locale_file = os.path.join(common_dir, 'main', '%s.xml' % locale)
        try:
            locale_tree = ElementTree.parse(locale_file)
        except (IOError, OSError) as e:
            # CLDR does not cover every locale; silently skip those.
            if e.errno == errno.ENOENT:
                return
            raise
        locale_root = locale_tree.getroot()

        locale_lang_root = locale_root.find('identity/language')
        lang = locale_lang_root.get('type')
        locale_territory_root = locale_root.find('identity/territory')
        territory = locale_territory_root.get('type')

        chars_file = os.path.join(common_dir, 'supplemental', 'characters.xml')
        chars_tree = ElementTree.parse(chars_file)
        chars_root = chars_tree.getroot()

        english_names_file = os.path.join(common_dir, 'main', 'en.xml')
        en_tree = ElementTree.parse(english_names_file)
        en_root = en_tree.getroot()
        en_names = en_root.find('localeDisplayNames')
        # First see if the locale has a name before we fall back to the lang.
        en_langs_root = en_names.find('languages')
        en_lang_name_root = en_langs_root.find('language[@type="%s"]' % locale)
        if en_lang_name_root is None:
            en_lang_name_root = en_langs_root.find('language[@type="%s"]' % lang)
        en_lang_name = en_lang_name_root.findtext('.')

        en_territory_name = en_names.find('territories/territory[@type="%s"]' %
                                          territory).findtext('.')

        tele_file = os.path.join(common_dir, 'supplemental', 'telephoneCodeData.xml')
        tele_tree = ElementTree.parse(tele_file)
        tele_root = tele_tree.getroot()
        int_prefix = tele_root.find('telephoneCodeData/codesByTerritory[@territory="%s"]'
                                    '/telephoneCountryCode' % territory).get('code')

        supp_file = os.path.join(common_dir, 'supplemental', 'supplementalData.xml')
        supp_tree = ElementTree.parse(supp_file)
        supp_root = supp_tree.getroot()
        territory_codes = supp_root.find('codeMappings/territoryCodes[@type="%s"]' % territory)
        country_num = territory_codes.get('numeric')
        #country_term = territory_codes.get('alpha3').lower()

        # The xmlpath support in python is not complete, so we need to search for
        # the currency w/missing @to attribute ourselves.
        currencies = supp_root.find('currencyData/region[@iso3166="%s"]' % territory)
        if currencies is None:
            raise ValueError('Could not find a currency for %s' % territory)
        # Iterate the element directly; Element.getchildren() no longer exists
        # in newer python versions.
        for currency in currencies:
            if 'to' not in currency.keys():
                break
        else:
            raise ValueError('Could not find a currency for %s' % territory)
        int_curr_symbol = currency.get('iso4217')

        lang_file = os.path.join(common_dir, 'main', '%s.xml' % lang)
        lang_tree = ElementTree.parse(lang_file)
        lang_root = lang_tree.getroot()
        langs_names = lang_root.find('localeDisplayNames')
        langs_root = langs_names.find('languages')
        langs_name_root = langs_root.find('language[@type="%s"]' % lang)
        lang_name = langs_name_root.findtext('.')

        country_name = langs_names.find('territories/territory[@type="%s"]' %
                                        territory).findtext('.')

        numbers_root = lang_root.find('numbers')
        symbol_ele = numbers_root.find('currencies/currency[@type="%s"]/symbol'
                                       % int_curr_symbol)
        if symbol_ele is None:
            # The language does not define a symbol, so look for a character
            # whose fallback substitute is the currency code itself.
            for symbol_ele in chars_root.find('characters/character-fallback'):
                if symbol_ele.findtext('substitute') == int_curr_symbol:
                    currency_symbol = symbol_ele.get('value')
                    break
            else:
                currency_symbol = ''
        else:
            currency_symbol = symbol_ele.findtext('.')

        # If there's a default labeled, use it.  Otherwise just go with the
        # first one found.  It should be the only one.
        num_sys_ele = numbers_root.find('defaultNumberingSystem')
        if num_sys_ele is None:
            num_symbols_root = numbers_root.find('symbols')
        else:
            num_symbols_root = numbers_root.find('symbols[@numberSystem="%s"]' %
                                                 num_sys_ele.findtext('.'))
        # A missing element shows up as an AttributeError on None.
        try:
            decimal_point = num_symbols_root.find('decimal').findtext('.')
        except AttributeError:
            decimal_point = '.'
        try:
            thousands_sep = num_symbols_root.find('group').findtext('.')
        except AttributeError:
            thousands_sep = ','

        dates_root = lang_root.find('dates')
        calendars_root = dates_root.find('calendars')
        calendar_root = calendars_root.find('calendar[@type="gregorian"]')

        # Prefer the stand-alone names, and fall back to the format ones.
        ABDAYS = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
        abdays_root = wddays_root = None
        for key in ('stand-alone', 'format'):
            days_root = calendar_root.find('days/dayContext[@type="%s"]' % key)
            if days_root is None:
                continue
            if abdays_root is None:
                abdays_root = days_root.find('dayWidth[@type="abbreviated"]')
            if wddays_root is None:
                wddays_root = days_root.find('dayWidth[@type="wide"]')
        if abdays_root is not None:
            abdays = [abdays_root.find('day[@type="%s"]' % x).findtext('.')
                      for x in ABDAYS]
        else:
            abdays = None
        wddays = [wddays_root.find('day[@type="%s"]' % x).findtext('.')
                  for x in ABDAYS]

        abmons_root = wdmons_root = None
        for key in ('stand-alone', 'format'):
            mons_root = calendar_root.find('months/monthContext[@type="%s"]' % key)
            if mons_root is None:
                continue
            if abmons_root is None:
                abmons_root = mons_root.find('monthWidth[@type="abbreviated"]')
            if wdmons_root is None:
                wdmons_root = mons_root.find('monthWidth[@type="wide"]')
        if abmons_root is not None:
            abmons = [abmons_root.find('month[@type="%s"]' % x).findtext('.')
                      for x in range(1, 13)]
        else:
            abmons = None
        wdmons = [wdmons_root.find('month[@type="%s"]' % x).findtext('.')
                  for x in range(1, 13)]

        # Start updating the actual data.
        cldr_values = {
            'generator': os.path.basename(__file__),
            'english_lang_name': en_lang_name,
            'english_territory_name': en_territory_name,
            'source_name': 'Unicode Common Locale Data Repository (CLDR)',
            'source_version': cldr_data['version'],
            'source_uri': cldr_data['uri'].replace('/', '//'),
            'source_date': cldr_data['date'].strftime('%Y-%m-%d'),
            'lang': lang,
            'territory': territory,
            'locale': locale,
            'int_curr_symbol': u_encode(int_curr_symbol + ' '),
            'currency_symbol': u_encode(currency_symbol),
            'decimal_point': u_encode(decimal_point),
            'thousands_sep': u_encode(thousands_sep),
            #'abday1': u_encode(abdays[0]),
            #'abday2': u_encode(abdays[1]),
            #'abday3': u_encode(abdays[2]),
            #'abday4': u_encode(abdays[3]),
            #'abday5': u_encode(abdays[4]),
            #'abday6': u_encode(abdays[5]),
            #'abday7': u_encode(abdays[6]),
            #'wideday1': u_encode(wddays[0]),
            #'wideday2': u_encode(wddays[1]),
            #'wideday3': u_encode(wddays[2]),
            #'wideday4': u_encode(wddays[3]),
            #'wideday5': u_encode(wddays[4]),
            #'wideday6': u_encode(wddays[5]),
            #'wideday7': u_encode(wddays[6]),
            #'abmon1': u_encode(abmons[0]),
            #'abmon2': u_encode(abmons[1]),
            #'abmon3': u_encode(abmons[2]),
            #'abmon4': u_encode(abmons[3]),
            #'abmon5': u_encode(abmons[4]),
            #'abmon6': u_encode(abmons[5]),
            #'abmon7': u_encode(abmons[6]),
            #'abmon8': u_encode(abmons[7]),
            #'abmon9': u_encode(abmons[8]),
            #'abmon10': u_encode(abmons[9]),
            #'abmon11': u_encode(abmons[10]),
            #'abmon12': u_encode(abmons[11]),
            #'widemon1': u_encode(wdmons[0]),
            #'widemon2': u_encode(wdmons[1]),
            #'widemon3': u_encode(wdmons[2]),
            #'widemon4': u_encode(wdmons[3]),
            #'widemon5': u_encode(wdmons[4]),
            #'widemon6': u_encode(wdmons[5]),
            #'widemon7': u_encode(wdmons[6]),
            #'widemon8': u_encode(wdmons[7]),
            #'widemon9': u_encode(wdmons[8]),
            #'widemon10': u_encode(wdmons[9]),
            #'widemon11': u_encode(wdmons[10]),
            #'widemon12': u_encode(wdmons[11]),
            'country_name': u_encode(country_name),
            'country_num': int(country_num),
            'lang_name': u_encode(lang_name),
            'lang_ab': u_encode(lang),
            #'lang_term': u_encode('deu'),
            #'lang_lib': u_encode('ger'),
            'int_prefix': u_encode(int_prefix),
        }

        # Per-category templates: keyword -> format string that is expanded
        # with |cldr_values| to get the new (unquoted) value.
        all_values = {}
        # NOTE(review): the template below is bound to the throwaway name |f|,
        # so lc_identification stays empty and nothing in that category gets
        # rewritten.  This looks like a deliberate temporary disable -- confirm.
        d = all_values['lc_identification'] = {}; f = {
            'title':     '%(english_lang_name)s language locale for %(english_territory_name)s',
            'source':    '%(source_name)s',
            #'address':   '%(source_uri)s',
            #'contact':   'http:////cldr.unicode.org//index//process',
            #'email':     'bug-glibc-locales@gnu.org',
            #'tel':       '',
            #'fax':       '',
            'language':  '%(english_lang_name)s',
            'territory': '%(english_territory_name)s',
            'revision':  '%(source_version)s',
            'date':      '%(source_date)s',
        }
        d = all_values['lc_ctype'] = {
        }
        d = all_values['lc_collate'] = {
        }
        d = all_values['lc_time'] = {
        }
        # The joined text looks like '"a";"b";'.  The slice drops the first
        # quote and the trailing '";' because the writer below wraps the whole
        # value in quotes again.
        if abdays:
            d['abday'] = ''.join('"%s";' % u_encode(x) for x in abdays)[1:-2]
        if wddays:
            d['day']   = ''.join('"%s";' % u_encode(x) for x in wddays)[1:-2]
        if abmons:
            d['abmon'] = ''.join('"%s";' % u_encode(x) for x in abmons)[1:-2]
        if wdmons:
            d['mon']   = ''.join('"%s";' % u_encode(x) for x in wdmons)[1:-2]
        d = all_values['lc_numeric'] = {
#            'decimal_point': '%(decimal_point)s',
#            'thousands_sep': '%(thousands_sep)s',
        }
        d = all_values['lc_monetary'] = {
#            'int_curr_symbol': '%(int_curr_symbol)s',
#            'currency_symbol': '%(currency_symbol)s',
        }
        d = all_values['lc_messages'] = {
        }
        d = all_values['lc_paper'] = {
        }
        d = all_values['lc_name'] = {
        }
        d = all_values['lc_address'] = {
            'country_name': '%(country_name)s',
            #'country_num': '%(country_num)i',
            'lang_name': '%(lang_name)s',
            'lang_ab': '%(lang_ab)s',
            #'lang_term': '%(lang_term)s',
            #'lang_lib': '%(lang_lib)s',
        }
        d = all_values['lc_telephone'] = {
            'int_prefix': '%(int_prefix)s',
        }
        d = all_values['lc_measurement'] = {
        }
        for category in self.categories:
            lc = getattr(self, category)
            values = all_values[category]

            # Index of the first physical line of a "/"-continued entry, and
            # the text gathered from its continuation lines so far.
            start_of_line = None
            full_line = ''
            i = 0
            while i < len(lc.content):
                line = lc.content[i]
                if not line:
                    i += 1
                    continue

                if line.endswith('/'):
                    # The entry continues on the next line; keep gathering.
                    if not full_line:
                        start_of_line = i
                    full_line += line[:-1].lstrip()
                    i += 1
                    continue
                elif full_line:
                    line = full_line + line.lstrip()
                    full_line = ''

                key = line.split()[0]
                if key in values:
                    m = re.match(r'([^"]*)"(.*)"$', line)
                    if m:
                        new_value = values[key] % cldr_values
                        # We should standardize case at some point.
                        if new_value.lower() != m.group(2).lower():
                            logging.info('%s: %s: changing {%s} to {%s}',
                                         self.name, key,
                                         u_decode(m.group(2)),
                                         u_decode(new_value))
                            leading_line = m.group(1)
                            # This is tricky as we have to delete most of the
                            # multiline, then update the one remaining.
                            if start_of_line is not None:
                                del lc.content[start_of_line:i]
                                i = start_of_line
                                if '";"' in new_value:
                                    # Put each list element on its own
                                    # continuation line, indented with tabs to
                                    # sit past the keyword.
                                    leading_line = leading_line.rstrip() + '\t'
                                    num_tabs = (len(leading_line) // 8) + 1
                                    new_value = new_value.replace(
                                        '";"',
                                        '";/\n' + ('\t' * num_tabs) + '"')
                            lc.content[i] = '%s"%s"' % (leading_line, new_value)

                # This entry is finished.  Forget where it started so a later
                # single-line entry is not mistaken for the tail of this one
                # (which would delete every line in between).
                start_of_line = None
                i += 1


def download_cldr(opts):
    """Download the current cldr database.

    The archive is fetched with wget and unpacked with unzip; both steps are
    skipped when their results already exist in the working dir.

    Args:
      opts: The parsed command line options; |opts.version| and
          |opts.working_dir| are read.

    Returns:
      A dict with the keys 'uri' (where the archive comes from), 'date' (the
      archive's mtime as a datetime), 'dir' (the unpacked "common" directory)
      and 'version' (the CLDR version).

    Raises:
      subprocess.CalledProcessError: wget or unzip failed.
    """
    # Set up the working dir.
    fields = {'version': opts.version}
    working_dir = opts.working_dir % fields
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)

    # Download the CLDR data.
    cldr_archive = os.path.join(working_dir, 'core.zip')
    cldr_uri = CLDR_URI % fields
    if not os.path.exists(cldr_archive):
        try:
            subprocess.check_call(['wget', '-O', cldr_archive, cldr_uri])
        except BaseException:
            # wget creates the output file even when the transfer fails or is
            # interrupted.  Remove the partial file so the next run tries the
            # download again instead of trusting a broken archive.
            if os.path.exists(cldr_archive):
                os.remove(cldr_archive)
            raise
    cldr_date = datetime.datetime.fromtimestamp(os.path.getmtime(cldr_archive))

    # Unpack the CLDR data.
    common_dir = os.path.join(working_dir, 'common')
    if not os.path.exists(common_dir):
        subprocess.check_call(['unzip', '-u', 'core.zip'], cwd=working_dir)

    return {
        'uri': cldr_uri,
        'date': cldr_date,
        'dir': common_dir,
        'version': opts.version,
    }


def main(argv):
    """Update every locale file named on the command line from CLDR.

    Each locale is loaded, merged with the CLDR data, and written back in
    place (via a temporary ".new" file).  Problems with one locale are logged
    and do not stop the remaining ones from being processed.

    Args:
      argv: The command line arguments, without the program name.
    """
    parser = get_parser()
    opts = parser.parse_args(argv)
    # Pass a real boolean: handing over |opts| itself is always truthy and
    # forces debug level.  There is no debug option yet, so this is False
    # until one gets added to the parser.
    logging_init(debug=getattr(opts, 'debug', False))

    cldr_data = download_cldr(opts)

    for locale in opts.locales:
        name = os.path.basename(locale)
        # Skip our own temporary output files.
        if name.endswith('.new'):
            continue
        # These are not regular locale files.
        if name.startswith('iso14651_') or name == 'POSIX':
            continue

        logging.info('Updating %s', locale)
        try:
            loc = Locale(name=name, path=locale)
            try:
                loc.update_cldr(cldr_data)
            except Exception:
                # Best effort: report it, and still write the locale back out
                # below with whatever was updated so far.
                logging.error('%s: updating failed', locale, exc_info=True)
            loc.write(locale + '.new')
            os.rename(locale + '.new', locale)
        except UnicodeDecodeError:
            logging.error('%s: bad encodings', locale, exc_info=True)
            # Show what the file tool thinks the file contains.
            subprocess.check_call(['file', locale])
        except (IndexError, LocaleError):
            logging.error('%s: loading failed', locale, exc_info=True)


if __name__ == '__main__':
    # Use sys.exit rather than the exit() helper, which is only injected by
    # the site module for interactive use and is not always available.
    sys.exit(main(sys.argv[1:]))

Attachment: signature.asc
Description: Digital signature


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]