{-# LANGUAGE BangPatterns, RecordWildCards #-}
-- |
-- Module      : Data.Text.ICU.Break.Pure
-- Copyright   : (c) 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- String breaking functions for Unicode, implemented as bindings to
-- the International Components for Unicode (ICU) libraries.
--
-- The text boundary positions are found according to the rules described in
-- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex
-- #14, Line Breaking Properties.  These are available at
-- <http://www.unicode.org/reports/tr14/> and
-- <http://www.unicode.org/reports/tr29/>.

module Data.Text.ICU.Break.Pure
    (
    -- * Types
      Breaker
    , Break
    , brkPrefix
    , brkBreak
    , brkSuffix
    , brkStatus
    , Line(..)
    , Word(..)
    -- * Breaking functions
    , breakCharacter
    , breakLine
    , breakSentence
    , breakWord
    -- * Iteration
    , breaks
    , breaksRight
    ) where

-- Since base 4.8 the Prelude exports its own 'Word' type.  Hide it so that
-- the unqualified 'Word' used in the export list and in 'breakWord' refers
-- unambiguously to the word-break status type imported from
-- "Data.Text.ICU.Break".
import Prelude hiding (Word)

import Data.Text (Text, empty)
import Data.Text.Foreign (dropWord16, takeWord16)
import Data.Text.ICU.Break (Line, Word)
import Data.Text.ICU.Break.Types (BreakIterator(..))
import Data.Text.ICU.Internal (LocaleName)
import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO)
import qualified Data.Text.ICU.Break as IO

-- | A boundary analyser.
newtype Breaker a = B (BreakIterator a)

-- | Wrap one of the 'IO' iterator constructors as a pure 'Breaker'.
--
-- The iterator is created over the 'empty' text; 'breaks' and
-- 'breaksRight' clone it and point the clone at the real input, so the
-- iterator stored in the 'Breaker' is not the one that gets stepped.
new :: (LocaleName -> Text -> IO (BreakIterator a)) -> LocaleName -> Breaker a
new act loc = unsafePerformIO $ B `fmap` act loc empty

-- | Break a string on character boundaries.
--
-- Character boundary analysis identifies the boundaries of "Extended
-- Grapheme Clusters", which are groupings of codepoints that should be
-- treated as character-like units for many text operations.  Please see
-- Unicode Standard Annex #29, Unicode Text Segmentation,
-- <http://www.unicode.org/reports/tr29/> for additional information on
-- grapheme clusters and guidelines on their use.
breakCharacter :: LocaleName -> Breaker ()
breakCharacter = new IO.breakCharacter

-- | Break a string on line boundaries.
--
-- Line boundary analysis determines where a text string can be broken when
-- line wrapping.  The mechanism correctly handles punctuation and hyphenated
-- words.
breakLine :: LocaleName -> Breaker Line
breakLine = new IO.breakLine

-- | Break a string on sentence boundaries.
--
-- Sentence boundary analysis allows selection with correct interpretation
-- of periods within numbers and abbreviations, and trailing punctuation
-- marks such as quotation marks and parentheses.
breakSentence :: LocaleName -> Breaker ()
breakSentence = new IO.breakSentence

-- | Break a string on word boundaries.
--
-- Word boundary analysis is used by search and replace functions, as well
-- as within text editing applications that allow the user to select words
-- with a double click.  Word selection provides correct interpretation of
-- punctuation marks within and following words.  Characters that are not
-- part of a word, such as symbols or punctuation marks, have word breaks on
-- both sides.
breakWord :: LocaleName -> Breaker Word
breakWord = new IO.breakWord

-- | A break in a string.
data Break a = Break {
      brkPrefix :: {-# UNPACK #-} !Text -- ^ Prefix of the current break.
    , brkBreak  :: {-# UNPACK #-} !Text -- ^ Text of the current break.
    , brkSuffix :: {-# UNPACK #-} !Text -- ^ Suffix of the current break.
    , brkStatus :: !a
    -- ^ Status of the current break (only meaningful if 'Line' or 'Word').
    } deriving (Eq, Show)

-- | Return a list of all breaks in a string, from left to right.
--
-- For each boundary @n@ reported after the previous boundary @p@, the
-- resulting 'Break' holds the text before @p@ as 'brkPrefix', the text
-- between @p@ and @n@ as 'brkBreak', and the text after @n@ as 'brkSuffix'.
--
-- The traversal is deferred with 'unsafeInterleaveIO' until the list is
-- first demanded; at that point the loop runs until the iterator reports
-- no further boundary.
breaks :: Breaker a -> Text -> [Break a]
breaks (B b) t = unsafePerformIO $ do
  -- Work on a private clone so that setting the text and stepping do not
  -- touch the iterator held by the 'Breaker'.
  bi <- IO.clone b
  IO.setText bi t
  let go p = do
        mix <- IO.next bi
        case mix of
          Nothing -> return []
          Just n  -> do
            -- Read the status while the iterator still sits on this boundary.
            s <- IO.getStatus bi
            let d   = n - p            -- length of this break
                u   = dropWord16 p t   -- input from the previous boundary on
                brk = Break (takeWord16 p t) (takeWord16 d u) (dropWord16 d u) s
            (brk :) `fmap` go n
  unsafeInterleaveIO $ go =<< IO.first bi

-- | Return a list of all breaks in a string, from right to left.
--
-- For each boundary @n@ reported before the previous boundary @p@, the
-- resulting 'Break' holds the text before @n@ as 'brkPrefix', the text
-- between @n@ and @p@ as 'brkBreak', and the text after @p@ as 'brkSuffix'.
--
-- The traversal is deferred with 'unsafeInterleaveIO' until the list is
-- first demanded; at that point the loop runs until the iterator reports
-- no further boundary.
breaksRight :: Breaker a -> Text -> [Break a]
breaksRight (B b) t = unsafePerformIO $ do
  -- Work on a private clone so that setting the text and stepping do not
  -- touch the iterator held by the 'Breaker'.
  bi <- IO.clone b
  IO.setText bi t
  let go p = do
        mix <- IO.previous bi
        case mix of
          Nothing -> return []
          Just n  -> do
            -- Read the status while the iterator still sits on this boundary.
            s <- IO.getStatus bi
            let d   = p - n            -- length of this break
                u   = dropWord16 n t   -- input from the new boundary on
                brk = Break (takeWord16 n t) (takeWord16 d u) (dropWord16 d u) s
            (brk :) `fmap` go n
  unsafeInterleaveIO $ go =<< IO.last bi