.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.14
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sh \" Subsection heading
.br
.if t .Sp
.ne 5
.PP
\fB\\$1\fR
.PP
..
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings. \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote. | will give a
.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
.\" expand to `' in nroff, nothing in troff, for use with C<>.
.tr \(*W-|\(bv\*(Tr
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
. ds -- \(*W-
. ds PI pi
. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
. ds L" ""
. ds R" ""
. ds C` ""
. ds C' ""
'br\}
.el\{\
. ds -- \|\(em\|
. ds PI \(*p
. ds L" ``
. ds R" ''
'br\}
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
.\" entries marked with X<> in POD. Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.if \nF \{\
. de IX
. tm Index:\\$1\t\\n%\t"\\$2"
..
. nr % 0
. rr F
.\}
.\"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.hy 0
.if n .na
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear. Run. Save yourself. No user-serviceable parts.
. \" fudge factors for nroff and troff
.if n \{\
. ds #H 0
. ds #V .8m
. ds #F .3m
. ds #[ \f1
. ds #] \fP
.\}
.if t \{\
. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
. ds #V .6m
. ds #F 0
. ds #[ \&
. ds #] \&
.\}
. \" simple accents for nroff and troff
.if n \{\
. ds ' \&
. ds ` \&
. ds ^ \&
. ds , \&
. ds ~ ~
. ds /
.\}
.if t \{\
. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
. \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
. \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
. \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
. ds : e
. ds 8 ss
. ds o a
. ds d- d\h'-1'\(ga
. ds D- D\h'-1'\(hy
. ds th \o'bp'
. ds Th \o'LP'
. ds ae ae
. ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "bt_split_names 3"
.TH bt_split_names 3 "2003-10-25" "btparse, version 0.34" "btparse"
.SH "NAME"
bt_split_names \- splitting up BibTeX names and lists of names
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 11
\& bt_stringlist * bt_split_list (char * string,
\& char * delim,
\& char * filename,
\& int line,
\& char * description);
\& void bt_free_list (bt_stringlist *list);
\& bt_name * bt_split_name (char * name,
\& char * filename,
\& int line,
\& int name_num);
\& void bt_free_name (bt_name * name);
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
When BibTeX files are used for their original purpose\-\-\-bibliographic
entries describing scholarly publications\-\-\-processing lists of names
(authors and editors mostly) becomes important. Although such
name-processing is outside the general-purpose database domain of most
of the \fBbtparse\fR library, these splitting functions are provided as a
concession to reality: most BibTeX data files use the BibTeX conventions
for author names, and a library to process that data ought to be capable
of processing the names.
.PP
Name-processing comes in two stages: first, split up a list of names
into individual strings; second, split up each name into \*(L"parts\*(R" (first,
von, last, and jr). The first is actually quite general: you could pick
a delimiter (such as \f(CW'and'\fR, used for lists of names) and use it to
divide any string into substrings. \f(CW\*(C`bt_split_list()\*(C'\fR could then be
called to break up the original string and extract the substrings.
\&\f(CW\*(C`bt_split_name()\*(C'\fR, however, is quite specific to four-part author names
written using BibTeX conventions. (These conventions are described
informally in any BibTeX documentation; the description you will find
here is more formal and algorithmic\-\-\-and thus harder to understand.)
.PP
See bt_format_names for information on turning split-up names back
into strings in a variety of ways.
.SH "FUNCTIONS"
.IX Header "FUNCTIONS"
.IP "\fIbt_split_list()\fR" 4
.IX Item "bt_split_list()"
.Vb 5
\& bt_stringlist * bt_split_list (char * string,
\& char * delim,
\& char * filename,
\& int line,
\& char * description)
.Ve
.Sp
Splits \f(CW\*(C`string\*(C'\fR into substrings delimited by \f(CW\*(C`delim\*(C'\fR (a fixed string).
The splitting is done according to the rules used by BibTeX for
splitting up a list of names, in particular:
.RS 4
.IP "*" 4
delimiters at the beginning or end of the string are ignored (that is, they are not treated as delimiters)
.IP "*" 4
delimiters must be surrounded by whitespace
.IP "*" 4
matching of delimiters is case insensitive
.IP "*" 4
delimiters at non-zero brace depth are ignored
.RE
.RS 4
.Sp
For instance, if the delimiter is \f(CW"and"\fR, then the string
.Sp
.Vb 1
\& Candy and Apples AnD {Green Eggs and Ham}
.Ve
.Sp
splits into three substrings: \f(CW"Candy"\fR, \f(CW"Apples"\fR, and
\&\f(CW"{Green Eggs and Ham}"\fR.
.Sp
If there are extra delimiters at the extremities of the string\-\-\-say,
an \f(CW"and"\fR at the beginning of the string\-\-\-then they are included in
the first/last string; no warning is currently printed, but this may
change. Successive delimiters (\f(CW"and and"\fR) result in a warning and a
\&\s-1NULL\s0 string being added to the list of substrings. For instance, the
string
.Sp
.Vb 1
\& and Joe Q. Blow and and Smith, Jr., John
.Ve
.Sp
would split into three substrings: \f(CW"and Joe Q. Blow"\fR, \f(CW\*(C`NULL\*(C'\fR, and
\&\f(CW"Smith, Jr., John"\fR.
.Sp
(If these rules seem somewhat odd, don't blame me: I just implemented
BibTeX's observed behaviour and added warning messages for one of the
more obvious and easily-detected mistakes.)
.Sp
The substrings are returned as a \f(CW\*(C`bt_stringlist\*(C'\fR structure:
.Sp
.Vb 6
\& typedef struct
\& {
\& char * string;
\& int num_items;
\& char ** items;
\& } bt_stringlist;
.Ve
.Sp
There is currently no elegant interface to this structure: you just have
to poke around in it yourself. The fields are:
.ie n .IP """string""" 4
.el .IP "\f(CWstring\fR" 4
.IX Item "string"
a copy of the \f(CW\*(C`string\*(C'\fR parameter passed to \f(CW\*(C`bt_split_list()\*(C'\fR, but with
\&\s-1NUL\s0 characters replacing the space after each substring. (This is safe
because delimiters must be surrounded by whitespace, which means that
each substring is followed by whitespace which is not part of the
substring.) You probably shouldn't fiddle with \f(CW\*(C`string\*(C'\fR; it's just
there so that \f(CW\*(C`bt_free_list()\*(C'\fR has something to \f(CW\*(C`free()\*(C'\fR.
.ie n .IP """num_items""" 4
.el .IP "\f(CWnum_items\fR" 4
.IX Item "num_items"
the number of substrings found in the string passed to
\&\f(CW\*(C`bt_split_list()\*(C'\fR.
.ie n .IP """items""" 4
.el .IP "\f(CWitems\fR" 4
.IX Item "items"
an array of \f(CW\*(C`num_items\*(C'\fR pointers into \f(CW\*(C`string\*(C'\fR. For instance,
\&\f(CW\*(C`items[1]\*(C'\fR points to the second substring. Since \f(CW\*(C`string\*(C'\fR has been
mangled with \s-1NUL\s0 characters, it is safe to treat \f(CW\*(C`items[i]\*(C'\fR as a
regular C string.
.Sp
\&\f(CW\*(C`filename\*(C'\fR, \f(CW\*(C`line\*(C'\fR, and \f(CW\*(C`description\*(C'\fR are all used for generating
warning messages. \f(CW\*(C`filename\*(C'\fR and \f(CW\*(C`line\*(C'\fR simply describe where the
string came from, and \f(CW\*(C`description\*(C'\fR is a brief (one word) description
of the substrings. For instance, if you are splitting a list of names,
supply \f(CW"name"\fR for \f(CW\*(C`description\*(C'\fR\-\-\-that way, warnings will refer to
\&\*(L"name X\*(R" rather than \*(L"substring x\*(R".
.RE
.RS 4
.RE
.IP "\fIbt_free_list()\fR" 4
.IX Item "bt_free_list()"
.Vb 1
\& void bt_free_list (bt_stringlist *list)
.Ve
.Sp
Frees a \f(CW\*(C`bt_stringlist\*(C'\fR structure as returned by \f(CW\*(C`bt_split_list()\*(C'\fR.
That is, it frees the copy of the string you passed to
\&\f(CW\*(C`bt_split_list()\*(C'\fR, and then frees the structure itself.
.IP "\fIbt_split_name()\fR" 4
.IX Item "bt_split_name()"
.Vb 4
\& bt_name * bt_split_name (char * name,
\& char * filename,
\& int line,
\& int name_num)
.Ve
.Sp
Splits a single BibTeX-style author name into four parts: first, von,
last, and jr. This can handle almost all names in the style of the
major Western European languages, but not quite all of them. (Alas!)
.Sp
A name is split by first dividing into tokens; tokens are separated by
whitespace or commas at brace-level zero. Thus the name
.Sp
.Vb 1
\& van der Graaf, Horace Q.
.Ve
.Sp
has five tokens, whereas the name
.Sp
.Vb 1
\& {Foo, Bar, and Sons}
.Ve
.Sp
consists of a single token.
.Sp
How tokens are divided into parts depends on the form of the name. If
the name has no commas at brace-level zero (as in the second example),
then it is assumed to be in either \*(L"first last\*(R" or \*(L"first von last\*(R"
form. If there are no tokens that start with a lower-case letter, then
\&\*(L"first last\*(R" form is assumed: the final token is the last name, and all
other tokens form the first name. Otherwise, the earliest contiguous
sequence of tokens with initial lower-case letters is taken as the `von'
part; if this sequence includes the final token, then a warning is
printed and the final token is forced to be the `last' part.
.Sp
If a name has a single comma, then it is assumed to be in \*(L"von last,
first\*(R" form. A leading sequence of tokens with initial lower-case
letters, if any, forms the `von' part; tokens between the `von' and the
comma form the `last' part; tokens following the comma form the `first'
part. Again, if there are no tokens following a leading sequence of
lower-case tokens, a warning is printed and the token immediately
preceding the comma is taken to be the `last' part.
.Sp
If a name has more than two commas, a warning is printed and the name is
treated as though only the first two commas were present.
.Sp
Finally, if a name has two commas, it is assumed to be in \*(L"von last, jr,
first\*(R" form. (This is the only way to represent a name with a `jr'
part.) The parsing of the name is the same as for a one-comma name,
except that tokens between the two commas are taken to be the `jr' part.
.Sp
The one case not properly handled by BibTeX name conventions is a name
with a 'jr' part not separated from the last name by a comma; for
example:
.Sp
.Vb 2
\& Henry Ford Jr.
\& George Herbert Walker Bush III
.Ve
.Sp
Both of these would be incorrectly interpreted by both BibTeX and
\&\fIbt_split_name()\fR: the \f(CW"Jr."\fR or \f(CW"III"\fR token would be taken as the
last name, and the other tokens as a two\- or four-part first name.
The workaround is to shoehorn the 'jr' into the last name:
.Sp
.Vb 2
\& Henry {Ford Jr.}
\& George Herbert Walker {Bush III}
.Ve
.Sp
but this will make it impossible to extract the last name on its own,
e.g. to generate \*(L"author\-year\*(R" style citations. This design flaw may be
fixed in a future version of \fBbtparse\fR.
.Sp
The split-up name is returned as a \f(CW\*(C`bt_name\*(C'\fR structure:
.Sp
.Vb 6
\& typedef struct
\& {
\& bt_stringlist * tokens;
\& char ** parts[BT_MAX_NAMEPARTS];
\& int part_len[BT_MAX_NAMEPARTS];
\& } bt_name;
.Ve
.Sp
Again, there's no nice interface to this structure; you'll just have to
access the fields individually. They are:
.RS 4
.ie n .IP """tokens""" 4
.el .IP "\f(CWtokens\fR" 4
.IX Item "tokens"
the name, broken down into a flat list of tokens. See above for a
description of the \f(CW\*(C`bt_stringlist\*(C'\fR structure.
.ie n .IP """parts""" 4
.el .IP "\f(CWparts\fR" 4
.IX Item "parts"
an array of arrays of pointers into the token list. The major dimension
of this beast is the \*(L"name part;\*(R" you should index this dimension using
the \f(CW\*(C`bt_namepart\*(C'\fR enum. For instance, \f(CW\*(C`parts[BTN_LAST]\*(C'\fR is an array
of pointers to the tokens comprising the last name;
\&\f(CW\*(C`parts[BTN_LAST][1]\*(C'\fR is a \f(CW\*(C`char *\*(C'\fR: the second token of the 'last'
part; and \f(CW\*(C`parts[BTN_LAST][1][0]\*(C'\fR is the first character of the second
token of the 'last' part.
.ie n .IP """part_len""" 4
.el .IP "\f(CWpart_len\fR" 4
.IX Item "part_len"
the length, in tokens, of each part. For instance, you might loop over
all tokens in the 'first' part as follows (assuming \f(CW\*(C`name\*(C'\fR is a
\&\f(CW\*(C`bt_name *\*(C'\fR returned by \f(CW\*(C`bt_split_name()\*(C'\fR):
.Sp
.Vb 5
\& for (i = 0; i < name->part_len[BTN_FIRST]; i++)
\& {
\& printf ("token %d of first name: %s\en",
\& i, name->parts[BTN_FIRST][i]);
\& }
.Ve
.RE
.RS 4
.RE
.IP "\fIbt_free_name()\fR" 4
.IX Item "bt_free_name()"
.Vb 1
\& void bt_free_name (bt_name * name)
.Ve
.Sp
Frees the \f(CW\*(C`bt_name\*(C'\fR structure created by \f(CW\*(C`bt_split_name()\*(C'\fR (including
the \f(CW\*(C`bt_stringlist\*(C'\fR structure inside the \f(CW\*(C`bt_name\*(C'\fR).
.SH "SEE ALSO"
.IX Header "SEE ALSO"
btparse, bt_format_names
.SH "AUTHOR"
.IX Header "AUTHOR"
Greg Ward