587 lines
14 KiB
Perl
Executable File
587 lines
14 KiB
Perl
Executable File
#! /usr/local/bin/perl -w
|
|
# $Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $
|
|
#
|
|
# Copyright (c) 2000,2001 Japan Network Information Center.
|
|
# All rights reserved.
|
|
#
|
|
# By using this file, you agree to the terms and conditions set forth bellow.
|
|
#
|
|
# LICENSE TERMS AND CONDITIONS
|
|
#
|
|
# The following License Terms and Conditions apply, unless a different
|
|
# license is obtained from Japan Network Information Center ("JPNIC"),
|
|
# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
|
|
# Chiyoda-ku, Tokyo 101-0047, Japan.
|
|
#
|
|
# 1. Use, Modification and Redistribution (including distribution of any
|
|
# modified or derived work) in source and/or binary forms is permitted
|
|
# under this License Terms and Conditions.
|
|
#
|
|
# 2. Redistribution of source code must retain the copyright notices as they
|
|
# appear in each source code file, this License Terms and Conditions.
|
|
#
|
|
# 3. Redistribution in binary form must reproduce the Copyright Notice,
|
|
# this License Terms and Conditions, in the documentation and/or other
|
|
# materials provided with the distribution. For the purposes of binary
|
|
# distribution the "Copyright Notice" refers to the following language:
|
|
# "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
|
|
#
|
|
# 4. The name of JPNIC may not be used to endorse or promote products
|
|
# derived from this Software without specific prior written approval of
|
|
# JPNIC.
|
|
#
|
|
# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
|
|
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
|
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
|
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
|
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
|
|
#
|
|
|
|
#
|
|
# Generate lib/unicodedata.c from UnicodeData.txt,
|
|
# CompositionExclusions-1.txt, SpecialCasing.txt and CaseFolding.txt,
|
|
# all of them available from ftp://ftp.unicode.org/Public/UNIDATA/.
|
|
#
|
|
|
|
use strict;
|
|
use lib qw(.);
|
|
|
|
use Getopt::Long;
|
|
use UCD;
|
|
use SparseMap;
|
|
|
|
use constant UCS_MAX => 0x110000;
|
|
use constant END_BIT => 0x80000000;
|
|
|
|
my $DECOMP_COMPAT_BIT = 0x8000;
|
|
|
|
my $CASEMAP_FINAL_BIT = 0x1;
|
|
my $CASEMAP_NONFINAL_BIT = 0x2;
|
|
my $CASEMAP_LAST_BIT = 0x10;
|
|
|
|
my $LETTER_BIT = 1;
|
|
my $NSPMARK_BIT = 2;
|
|
|
|
(my $myid = '$Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/;
|
|
|
|
my @default_bits = (9, 7, 5);
|
|
#my @default_bits = (7, 7, 7);
|
|
my @canon_class_bits = @default_bits;
|
|
my @decomp_bits = @default_bits;
|
|
my @comp_bits = @default_bits;
|
|
my @folding_bits = @default_bits;
|
|
my @casemap_bits = @default_bits;
|
|
my @casemap_ctx_bits = @default_bits;
|
|
|
|
my $prefix = '';
|
|
my $dir = '.';
|
|
my $unicodedatafile = 'UnicodeData.txt';
|
|
my $exclusionfile = 'CompositionExclusions.txt';
|
|
my $specialcasefile = 'SpecialCasing.txt';
|
|
my $casefoldingfile = 'CaseFolding.txt';
|
|
my $verbose;
|
|
|
|
GetOptions('dir|d=s' => \$dir,
|
|
'unicodedata|u=s' => \$unicodedatafile,
|
|
'exclude|e=s' => \$exclusionfile,
|
|
'specialcase|s=s' => \$specialcasefile,
|
|
'casefold|c=s' => \$casefoldingfile,
|
|
'prefix|p=s' => \$prefix,
|
|
'verbose|v' => \$verbose,
|
|
) or usage();
|
|
|
|
foreach my $r (\$unicodedatafile, \$exclusionfile,
|
|
\$specialcasefile, \$casefoldingfile) {
|
|
$$r = "$dir/$$r" unless $$r =~ m|^/|;
|
|
}
|
|
|
|
my %exclusions;
|
|
my %lower_special;
|
|
my %upper_special;
|
|
|
|
my @decomp_data;
|
|
my @comp_data;
|
|
my @toupper_data;
|
|
my @tolower_data;
|
|
my @folding_data;
|
|
|
|
#
|
|
# Create Mapping/Bitmap objects.
|
|
#
|
|
|
|
# canonical class
|
|
my $canon_class = SparseMap::Int->new(BITS => [@canon_class_bits],
|
|
MAX => UCS_MAX,
|
|
MAPALL => 1,
|
|
DEFAULT => 0);
|
|
|
|
# canonical/compatibility decomposition
|
|
my $decomp = SparseMap::Int->new(BITS => [@decomp_bits],
|
|
MAX => UCS_MAX,
|
|
MAPALL => 1,
|
|
DEFAULT => 0);
|
|
|
|
# canonical composition
|
|
my $comp = SparseMap::Int->new(BITS => [@comp_bits],
|
|
MAX => UCS_MAX,
|
|
MAPALL => 1,
|
|
DEFAULT => 0);
|
|
|
|
# uppercase/lowercase
|
|
my $upper = SparseMap::Int->new(BITS => [@casemap_bits],
|
|
MAX => UCS_MAX,
|
|
MAPALL => 1,
|
|
DEFAULT => 0);
|
|
my $lower = SparseMap::Int->new(BITS => [@casemap_bits],
|
|
MAX => UCS_MAX,
|
|
MAPALL => 1,
|
|
DEFAULT => 0);
|
|
|
|
# final/nonfinal context
|
|
my $casemap_ctx = SparseMap::Int->new(BITS => [@casemap_ctx_bits],
|
|
MAX => UCS_MAX,
|
|
MAPALL => 1,
|
|
DEFAULT => 0);
|
|
|
|
# casefolding
|
|
my $folding = SparseMap::Int->new(BITS => [@folding_bits],
|
|
MAX => UCS_MAX,
|
|
MAPALL => 1,
|
|
DEFAULT => 0);
|
|
|
|
#
|
|
# Read datafiles.
|
|
#
|
|
|
|
read_exclusion_file();
|
|
read_specialcasing_file();
|
|
read_unicodedata_file();
|
|
read_casefolding_file();
|
|
|
|
print_header();
|
|
print_canon_class();
|
|
print_composition();
|
|
print_decomposition();
|
|
print_casemap();
|
|
print_casemap_context();
|
|
print_casefolding();
|
|
|
|
exit;
|
|
|
|
sub usage {
|
|
print STDERR <<"END";
|
|
Usage: $0 [options..]
|
|
options:
|
|
-d DIR directory where Unicode Character Data files resides [./]
|
|
-u FILE name of the UnicodeData file [UnicodeData.txt]
|
|
-e FILE name of the CompositionExclusion file [CompositionExclusions-1.txt]
|
|
-s FILE name of the SpecialCasing file [SpecialCasing.txt]
|
|
-c FILE name of the CaseFolding file [CaseFolding.txt]
|
|
END
|
|
exit 1;
|
|
}
|
|
|
|
#
|
|
# read_exclusion_file -- read CompositionExclusions-1.txt.
|
|
#
|
|
sub read_exclusion_file {
|
|
open EXCLUDE, $exclusionfile or die "cannot open $exclusionfile: $!\n";
|
|
while ($_ = UCD::CompositionExclusions::getline(\*EXCLUDE)) {
|
|
my %data = UCD::CompositionExclusions::parseline($_);
|
|
$exclusions{$data{CODE}} = 1;
|
|
}
|
|
close EXCLUDE;
|
|
}
|
|
|
|
#
|
|
# read_specialcasing_file -- read SpecialCasing.txt
|
|
#
|
|
sub read_specialcasing_file {
|
|
open SPCASE, $specialcasefile or die "cannot open $specialcasefile: $!\n";
|
|
while ($_ = UCD::SpecialCasing::getline(\*SPCASE)) {
|
|
my %data = UCD::SpecialCasing::parseline($_);
|
|
my $code = $data{CODE};
|
|
my $lower = $data{LOWER};
|
|
my $upper = $data{UPPER};
|
|
my $cond = $data{CONDITION} || '';
|
|
|
|
next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/;
|
|
|
|
if (defined $cond && (@$lower > 1 || $lower->[0] != $code)
|
|
or @$lower > 1 or $lower->[0] != $code) {
|
|
$lower_special{$code} = [$lower, $cond];
|
|
}
|
|
if (defined $cond && (@$upper > 1 || $upper->[0] != $code)
|
|
or @$upper > 1 or $upper->[0] != $code) {
|
|
$upper_special{$code} = [$upper, $cond];
|
|
}
|
|
}
|
|
close SPCASE;
|
|
}
|
|
|
|
#
|
|
# read_unicodedata_file -- read UnicodeData.txt
|
|
#
|
|
sub read_unicodedata_file {
|
|
open UCD, $unicodedatafile or die "cannot open $unicodedatafile: $!\n";
|
|
|
|
@decomp_data = (0);
|
|
@toupper_data = (0);
|
|
@tolower_data = (0);
|
|
|
|
my @comp_cand; # canonical composition candidates
|
|
my %nonstarter;
|
|
|
|
while ($_ = UCD::UnicodeData::getline(\*UCD)) {
|
|
my %data = UCD::UnicodeData::parseline($_);
|
|
my $code = $data{CODE};
|
|
|
|
# combining class
|
|
if ($data{CLASS} > 0) {
|
|
$nonstarter{$code} = 1;
|
|
$canon_class->add($code, $data{CLASS});
|
|
}
|
|
|
|
# uppercasing
|
|
if (exists $upper_special{$code} or defined $data{UPPER}) {
|
|
my $offset = @toupper_data;
|
|
my @casedata;
|
|
|
|
$upper->add($code, $offset);
|
|
if (exists $upper_special{$code}) {
|
|
push @casedata, $upper_special{$code};
|
|
}
|
|
if (defined $data{UPPER}) {
|
|
push @casedata, $data{UPPER};
|
|
}
|
|
push @toupper_data, casemap_data(@casedata);
|
|
}
|
|
|
|
# lowercasing
|
|
if (exists $lower_special{$code} or defined $data{LOWER}) {
|
|
my $offset = @tolower_data;
|
|
my @casedata;
|
|
|
|
$lower->add($code, $offset);
|
|
if (exists $lower_special{$code}) {
|
|
push @casedata, $lower_special{$code};
|
|
}
|
|
if (defined $data{LOWER}) {
|
|
push @casedata, $data{LOWER};
|
|
}
|
|
push @tolower_data, casemap_data(@casedata);
|
|
}
|
|
|
|
# composition/decomposition
|
|
if ($data{DECOMP}) {
|
|
my ($tag, @decomp) = @{$data{DECOMP}};
|
|
my $offset = @decomp_data;
|
|
|
|
# composition
|
|
if ($tag eq '' and @decomp > 1 and not exists $exclusions{$code}) {
|
|
# canonical composition candidate
|
|
push @comp_cand, [$code, @decomp];
|
|
}
|
|
|
|
# decomposition
|
|
if ($tag ne '') {
|
|
# compatibility decomposition
|
|
$offset |= $DECOMP_COMPAT_BIT;
|
|
}
|
|
$decomp->add($code, $offset);
|
|
push @decomp_data, @decomp;
|
|
$decomp_data[-1] |= END_BIT;
|
|
|
|
}
|
|
|
|
# final/nonfinal context
|
|
if ($data{CATEGORY} =~ /L[ult]/) {
|
|
$casemap_ctx->add($code, $LETTER_BIT);
|
|
} elsif ($data{CATEGORY} eq 'Mn') {
|
|
$casemap_ctx->add($code, $NSPMARK_BIT);
|
|
}
|
|
}
|
|
close UCD;
|
|
|
|
# Eliminate composition candidates whose decomposition starts with
|
|
# a non-starter.
|
|
@comp_cand = grep {not exists $nonstarter{$_->[1]}} @comp_cand;
|
|
|
|
@comp_data = ([0, 0, 0]);
|
|
my $last_code = -1;
|
|
my $last_offset = @comp_data;
|
|
for my $r (sort {$a->[1] <=> $b->[1] || $a->[2] <=> $b->[2]} @comp_cand) {
|
|
if ($r->[1] != $last_code) {
|
|
$comp->add($last_code,
|
|
($last_offset | ((@comp_data - $last_offset)<<16)))
|
|
unless $last_code == -1;
|
|
$last_code = $r->[1];
|
|
$last_offset = @comp_data;
|
|
}
|
|
push @comp_data, $r;
|
|
}
|
|
$comp->add($last_code,
|
|
($last_offset | ((@comp_data - $last_offset)<<16)));
|
|
}
|
|
|
|
sub casemap_data {
|
|
my @data = @_;
|
|
my @result = ();
|
|
while (@data > 0) {
|
|
my $r = shift @data;
|
|
my $flag = 0;
|
|
if (ref $r) {
|
|
if ($r->[1] eq 'FINAL') {
|
|
$flag |= $CASEMAP_FINAL_BIT;
|
|
} elsif ($r->[1] eq 'NON_FINAL') {
|
|
$flag |= $CASEMAP_NONFINAL_BIT;
|
|
} elsif ($r->[1] ne '') {
|
|
die "unknown condition \"", $r->[1], "\"\n";
|
|
}
|
|
}
|
|
$flag |= $CASEMAP_LAST_BIT if @data == 0;
|
|
push @result, $flag;
|
|
push @result, (ref $r) ? @{$r->[0]} : $r;
|
|
$result[-1] |= END_BIT;
|
|
}
|
|
@result;
|
|
}
|
|
|
|
#
|
|
# read_casefolding_file -- read CaseFolding.txt
|
|
#
|
|
sub read_casefolding_file {
|
|
open FOLD, $casefoldingfile or die "cannto open $casefoldingfile: $!\n";
|
|
|
|
# dummy.
|
|
@folding_data = (0);
|
|
|
|
while ($_ = UCD::CaseFolding::getline(\*FOLD)) {
|
|
my %data = UCD::CaseFolding::parseline($_);
|
|
|
|
$folding->add($data{CODE}, scalar(@folding_data));
|
|
push @folding_data, @{$data{MAP}};
|
|
$folding_data[-1] |= END_BIT;
|
|
}
|
|
close FOLD;
|
|
}
|
|
|
|
sub print_header {
|
|
print <<"END";
|
|
/* \$Id\$ */
|
|
/* $myid */
|
|
/*
|
|
* Do not edit this file!
|
|
* This file is generated from UnicodeData.txt, CompositionExclusions-1.txt,
|
|
* SpecialCasing.txt and CaseFolding.txt.
|
|
*/
|
|
|
|
END
|
|
}
|
|
|
|
#
|
|
# print_canon_class -- generate data for canonical class
|
|
#
|
|
sub print_canon_class {
|
|
$canon_class->fix();
|
|
print STDERR "** cannon_class\n", $canon_class->stat() if $verbose;
|
|
|
|
print <<"END";
|
|
|
|
/*
|
|
* Canonical Class
|
|
*/
|
|
|
|
END
|
|
print_bits("CANON_CLASS", @canon_class_bits);
|
|
print "\n";
|
|
print $canon_class->cprog(NAME => "${prefix}canon_class");
|
|
}
|
|
|
|
#
|
|
# print_composition -- generate data for canonical composition
|
|
#
|
|
sub print_composition {
|
|
$comp->fix();
|
|
print STDERR "** composition\n", $comp->stat() if $verbose;
|
|
|
|
print <<"END";
|
|
|
|
/*
|
|
* Canonical Composition
|
|
*/
|
|
|
|
END
|
|
print_bits("CANON_COMPOSE", @comp_bits);
|
|
print "\n";
|
|
print $comp->cprog(NAME => "${prefix}compose");
|
|
print <<"END";
|
|
|
|
static const struct composition ${prefix}compose_seq[] = {
|
|
END
|
|
my $i = 0;
|
|
foreach my $r (@comp_data) {
|
|
if ($i % 2 == 0) {
|
|
print "\n" if $i != 0;
|
|
print "\t";
|
|
}
|
|
printf "{ 0x%08x, 0x%08x }, ", $r->[2], $r->[0];
|
|
$i++;
|
|
}
|
|
print "\n};\n\n";
|
|
}
|
|
|
|
#
|
|
# print_decomposition -- generate data for canonical/compatibility
|
|
# decomposition
|
|
#
|
|
sub print_decomposition {
|
|
$decomp->fix();
|
|
print STDERR "** decomposition\n", $decomp->stat() if $verbose;
|
|
|
|
print <<"END";
|
|
|
|
/*
|
|
* Canonical/Compatibility Decomposition
|
|
*/
|
|
|
|
END
|
|
print_bits("DECOMP", @decomp_bits);
|
|
print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n";
|
|
|
|
print $decomp->cprog(NAME => "${prefix}decompose");
|
|
|
|
print "static const unsigned long ${prefix}decompose_seq[] = {\n";
|
|
print_ulseq(@decomp_data);
|
|
print "};\n\n";
|
|
}
|
|
|
|
#
|
|
# print_casemap -- generate data for case mapping
|
|
#
|
|
sub print_casemap {
|
|
$upper->fix();
|
|
$lower->fix();
|
|
print STDERR "** upper mapping\n", $upper->stat() if $verbose;
|
|
print STDERR "** lower mapping\n", $lower->stat() if $verbose;
|
|
|
|
print <<"END";
|
|
|
|
/*
|
|
* Lowercase <-> Uppercase mapping
|
|
*/
|
|
|
|
/*
|
|
* Flags for special case mapping.
|
|
*/
|
|
#define CMF_FINAL $CASEMAP_FINAL_BIT
|
|
#define CMF_NONFINAL $CASEMAP_NONFINAL_BIT
|
|
#define CMF_LAST $CASEMAP_LAST_BIT
|
|
#define CMF_CTXDEP (CMF_FINAL|CMF_NONFINAL)
|
|
|
|
END
|
|
print_bits("CASEMAP", @casemap_bits);
|
|
print "\n";
|
|
print $upper->cprog(NAME => "${prefix}toupper");
|
|
print $lower->cprog(NAME => "${prefix}tolower");
|
|
|
|
print "static const unsigned long ${prefix}toupper_seq[] = {\n";
|
|
print_ulseq(@toupper_data);
|
|
print "};\n\n";
|
|
|
|
print "static const unsigned long ${prefix}tolower_seq[] = {\n";
|
|
print_ulseq(@tolower_data);
|
|
print "};\n\n";
|
|
}
|
|
|
|
#
|
|
# print_casefolding -- generate data for case folding
|
|
#
|
|
sub print_casefolding {
|
|
$folding->fix();
|
|
print STDERR "** case folding\n", $folding->stat() if $verbose;
|
|
|
|
print <<"END";
|
|
|
|
/*
|
|
* Case Folding
|
|
*/
|
|
|
|
END
|
|
print_bits("CASE_FOLDING", @folding_bits);
|
|
print "\n";
|
|
print $folding->cprog(NAME => "${prefix}case_folding");
|
|
|
|
print "static const unsigned long ${prefix}case_folding_seq[] = {\n";
|
|
print_ulseq(@folding_data);
|
|
print "};\n\n";
|
|
}
|
|
|
|
#
|
|
# print_casemap_context -- gerarate data for determining context
|
|
# (final/non-final)
|
|
#
|
|
sub print_casemap_context {
|
|
$casemap_ctx->fix();
|
|
print STDERR "** casemap context\n", $casemap_ctx->stat() if $verbose;
|
|
|
|
print <<"END";
|
|
|
|
/*
|
|
* Cased characters and non-spacing marks (for casemap context)
|
|
*/
|
|
|
|
END
|
|
|
|
print_bits("CASEMAP_CTX", @casemap_ctx_bits);
|
|
print <<"END";
|
|
|
|
#define CTX_CASED $LETTER_BIT
|
|
#define CTX_NSM $NSPMARK_BIT
|
|
|
|
END
|
|
print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx");
|
|
}
|
|
|
|
sub sprint_composition_hash {
|
|
my $i = 0;
|
|
my $s = '';
|
|
foreach my $r (@_) {
|
|
if ($i % 2 == 0) {
|
|
$s .= "\n" if $i != 0;
|
|
$s .= "\t";
|
|
}
|
|
$s .= sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$r};
|
|
$i++;
|
|
}
|
|
$s;
|
|
}
|
|
|
|
sub print_bits {
|
|
my $prefix = shift;
|
|
my $i = 0;
|
|
foreach my $bit (@_) {
|
|
print "#define ${prefix}_BITS_$i\t$bit\n";
|
|
$i++;
|
|
}
|
|
}
|
|
|
|
sub print_ulseq {
|
|
my $i = 0;
|
|
foreach my $v (@_) {
|
|
if ($i % 4 == 0) {
|
|
print "\n" if $i != 0;
|
|
print "\t";
|
|
}
|
|
printf "0x%08x, ", $v;
|
|
$i++;
|
|
}
|
|
print "\n";
|
|
}
|