Contents of /code/trunk/maintain/Builducptable

Applied Bob and Daniel's patches to convert the build system to automake. Added
the maintain directory, containing files that are used for maintenance, but are
not distributed. This is an intermediate step.

1

#! /usr/bin/perl -w

2

3

# This is a Perl script to create the table of character properties. For

4

# information on the format, see ucpinternal.h. The Unicode files are

5

# expected to be in maintain/Unicode.tables/{Scripts,UnicodeData}.txt. The

6

# ucp.h file is also required. The table is written to the standard output.

7

8

# The script is rather slow because it just searches linearly through the

9

# Scripts data in order to find the script for each character or character

10

# range. It could be made faster by sorting that data, or something, but hey,

11

# it is only ever run once in a blue moon.

12

13

# Subroutine: Given a character number, return the script number. The

14

# Scripts.txt file has been read into an array, keeping just the codepoints

15

# and the script name. The lines are in one of two formats:

16

#

17

# xxxx name

18

# xxxx..yyyy name

19

#

20

# where xxxx and yyyy are code points.

21

22

sub script{

23

my($n) = $_[0];

24

foreach $line (@scriptlist)

25

{

26

my($a,$z,$s);

27

28

if ($line =~ /\.\./)

29

{

30

($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;

31

}

32

else

33

{

34

($a,$s) = $line =~ /^(\S+)\s+(.*)/;

35

$z = $a;

36

}

37

38

die "Problem on this scripts data line:\n$line"

39

if (!defined $a || !defined $z);

40

41

if ($n >= hex($a) && $n <= hex($z))

42

{

43

my($x) = $scriptnum{$s};

44

return $x if defined $x;

45

die "Can't find script number for $s\n";

46

}

47

}

48

49

# All code points not explicitly listed are "Common"

50

51

return $scriptnum{"Common"};

52

}

53

54

55

# Subroutine: given a category name, return its number

56

57

sub category {

58

my($x) = $gencat{$_[0]};

59

return $x if defined $x;

60

die "Can't find number for general category $_[0]\n";

61

}

62

63

64

# Subroutine: output an entry for a range, unless it isn't really a range,

65

# in which case just output a single entry.

66

67

sub outrange{

68

my($cp,$ncp,$gc) = @_;

69

my($flag) = ($cp != $ncp)? 0x00800000 : 0;

70

printf " { 0x%08x, 0x%08x },\n",

71

$cp | $flag | (script($cp) << 24),

72

(category($gc) << 26) | $ncp - $cp;

73

}

74

75

76

# Entry point: An argument giving the Unicode version is required.

77

78

die "Require a single argument, the Unicode version"

79

if ($#ARGV != 0);

80

$Uversion = shift @ARGV;

81

82

83

# Read in the Scripts.txt file, retaining only the code points

84

# and script names.

85

86

open(IN, "Unicode.tables/Scripts.txt") ||

87

die "Can't open Unicode.tables/Scripts.txt: $!\n";

88

89

while (<IN>)

90

{

91

next if !/^[0-9A-Z]/;

92

my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;

93

push @scriptlist, "$range $name";

94

}

95

close(IN);

96

97

98

# Now read the ucp.h file to get the values for the general categories

99

# and for the scripts.

100

101

open(IN, "ucp.h") || die "Can't open ucp.h: $!\n";

102

103

while (<IN>) { last if /^enum/; }

104

while (<IN>) { last if /^enum/; }

105

106

107

# The second enum are the general categories.

108

109

$count = 0;

110

while (<IN>)

111

{

112

last if $_ !~ /^\s+ucp_(..)/;

113

$gencat{$1} = $count++;

114

}

115

116

while (<IN>) { last if /^enum/; }

117

118

# The third enum are script names.

119

120

$count = 0;

121

while (<IN>)

122

{

123

last if $_ !~ /^\s+ucp_(\w+)/;

124

$scriptnum{$1} = $count++;

125

}

126

127

close(IN);

128

129

# Write out the initial boilerplace.

130

131

print "/* This source module is automatically generated from the Unicode\n" .

132

"property table. See ucpinternal.h for a description of the layout.\n" .

133

"This version was made from the Unicode $Uversion tables. */\n\n" .

134

"static cnode ucp_table[] = {\n";

135

136

# Now read the input file and generate the output.

137

138

open(IN, "Unicode.tables/UnicodeData.txt") ||

139

die "Can't open Unicode.tables/UnicodeData.txt: $!\n";

140

141

while (<IN>)

142

{

143

@fields = split /;/;

144

145

$cp = hex($fields[0]);

146

$gc = $fields[2];

147

$uc = $fields[12];

148

$lc = $fields[13];

149

150

# If this line has no "other case" data, it might be the start or end of

151

# a range, either one that is explicit in the data, or one that we can