mirror of
https://github.com/Luzifer/mondash.git
synced 2024-12-23 12:31:18 +00:00
450 lines
11 KiB
Go
450 lines
11 KiB
Go
// Copyright 2011 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package norm
|
|
|
|
import (
|
|
"fmt"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
|
|
// sequence of starter and non-starter runes for the purpose of normalization.
|
|
const MaxSegmentSize = maxByteBufferSize
|
|
|
|
// An Iter iterates over a string or byte slice, while normalizing it
|
|
// to a given Form.
|
|
type Iter struct {
|
|
rb reorderBuffer
|
|
buf [maxByteBufferSize]byte
|
|
info Properties // first character saved from previous iteration
|
|
next iterFunc // implementation of next depends on form
|
|
asciiF iterFunc
|
|
|
|
p int // current position in input source
|
|
multiSeg []byte // remainder of multi-segment decomposition
|
|
}
|
|
|
|
// iterFunc is the signature of the step functions an Iter dispatches to from
// Next; each one receives the iterator and returns a byte slice.
type iterFunc func(*Iter) []byte
|
|
|
|
// Init initializes i to iterate over src after normalizing it to Form f.
|
|
func (i *Iter) Init(f Form, src []byte) {
|
|
i.p = 0
|
|
if len(src) == 0 {
|
|
i.setDone()
|
|
i.rb.nsrc = 0
|
|
return
|
|
}
|
|
i.multiSeg = nil
|
|
i.rb.init(f, src)
|
|
i.next = i.rb.f.nextMain
|
|
i.asciiF = nextASCIIBytes
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
}
|
|
|
|
// InitString initializes i to iterate over src after normalizing it to Form f.
|
|
func (i *Iter) InitString(f Form, src string) {
|
|
i.p = 0
|
|
if len(src) == 0 {
|
|
i.setDone()
|
|
i.rb.nsrc = 0
|
|
return
|
|
}
|
|
i.multiSeg = nil
|
|
i.rb.initString(f, src)
|
|
i.next = i.rb.f.nextMain
|
|
i.asciiF = nextASCIIString
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
}
|
|
|
|
// Seek sets the segment to be returned by the next call to Next to start
|
|
// at position p. It is the responsibility of the caller to set p to the
|
|
// start of a UTF8 rune.
|
|
func (i *Iter) Seek(offset int64, whence int) (int64, error) {
|
|
var abs int64
|
|
switch whence {
|
|
case 0:
|
|
abs = offset
|
|
case 1:
|
|
abs = int64(i.p) + offset
|
|
case 2:
|
|
abs = int64(i.rb.nsrc) + offset
|
|
default:
|
|
return 0, fmt.Errorf("norm: invalid whence")
|
|
}
|
|
if abs < 0 {
|
|
return 0, fmt.Errorf("norm: negative position")
|
|
}
|
|
if int(abs) >= i.rb.nsrc {
|
|
i.setDone()
|
|
return int64(i.p), nil
|
|
}
|
|
i.p = int(abs)
|
|
i.multiSeg = nil
|
|
i.next = i.rb.f.nextMain
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
return abs, nil
|
|
}
|
|
|
|
// returnSlice returns a slice of the underlying input type as a byte slice.
|
|
// If the underlying is of type []byte, it will simply return a slice.
|
|
// If the underlying is of type string, it will copy the slice to the buffer
|
|
// and return that.
|
|
func (i *Iter) returnSlice(a, b int) []byte {
|
|
if i.rb.src.bytes == nil {
|
|
return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
|
|
}
|
|
return i.rb.src.bytes[a:b]
|
|
}
|
|
|
|
// Pos returns the byte position at which the next call to Next will commence processing.
|
|
func (i *Iter) Pos() int {
|
|
return i.p
|
|
}
|
|
|
|
func (i *Iter) setDone() {
|
|
i.next = nextDone
|
|
i.p = i.rb.nsrc
|
|
}
|
|
|
|
// Done returns true if there is no more input to process.
|
|
func (i *Iter) Done() bool {
|
|
return i.p >= i.rb.nsrc
|
|
}
|
|
|
|
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
|
|
// For any input a and b for which f(a) == f(b), subsequent calls
|
|
// to Next will return the same segments.
|
|
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
|
|
// Although not guaranteed, n will typically be the smallest possible n.
|
|
func (i *Iter) Next() []byte {
|
|
return i.next(i)
|
|
}
|
|
|
|
func nextASCIIBytes(i *Iter) []byte {
|
|
p := i.p + 1
|
|
if p >= i.rb.nsrc {
|
|
i.setDone()
|
|
return i.rb.src.bytes[i.p:p]
|
|
}
|
|
if i.rb.src.bytes[p] < utf8.RuneSelf {
|
|
p0 := i.p
|
|
i.p = p
|
|
return i.rb.src.bytes[p0:p]
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
i.next = i.rb.f.nextMain
|
|
return i.next(i)
|
|
}
|
|
|
|
func nextASCIIString(i *Iter) []byte {
|
|
p := i.p + 1
|
|
if p >= i.rb.nsrc {
|
|
i.buf[0] = i.rb.src.str[i.p]
|
|
i.setDone()
|
|
return i.buf[:1]
|
|
}
|
|
if i.rb.src.str[p] < utf8.RuneSelf {
|
|
i.buf[0] = i.rb.src.str[i.p]
|
|
i.p = p
|
|
return i.buf[:1]
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
i.next = i.rb.f.nextMain
|
|
return i.next(i)
|
|
}
|
|
|
|
func nextHangul(i *Iter) []byte {
|
|
p := i.p
|
|
next := p + hangulUTF8Size
|
|
if next >= i.rb.nsrc {
|
|
i.setDone()
|
|
} else if i.rb.src.hangul(next) == 0 {
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
i.next = i.rb.f.nextMain
|
|
return i.next(i)
|
|
}
|
|
i.p = next
|
|
return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
|
|
}
|
|
|
|
func nextDone(i *Iter) []byte {
|
|
return nil
|
|
}
|
|
|
|
// nextMulti is used for iterating over multi-segment decompositions
|
|
// for decomposing normal forms.
|
|
func nextMulti(i *Iter) []byte {
|
|
j := 0
|
|
d := i.multiSeg
|
|
// skip first rune
|
|
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
|
|
}
|
|
for j < len(d) {
|
|
info := i.rb.f.info(input{bytes: d}, j)
|
|
if info.BoundaryBefore() {
|
|
i.multiSeg = d[j:]
|
|
return d[:j]
|
|
}
|
|
j += int(info.size)
|
|
}
|
|
// treat last segment as normal decomposition
|
|
i.next = i.rb.f.nextMain
|
|
return i.next(i)
|
|
}
|
|
|
|
// nextMultiNorm is used for iterating over multi-segment decompositions
|
|
// for composing normal forms.
|
|
func nextMultiNorm(i *Iter) []byte {
|
|
j := 0
|
|
d := i.multiSeg
|
|
for j < len(d) {
|
|
info := i.rb.f.info(input{bytes: d}, j)
|
|
if info.BoundaryBefore() {
|
|
i.rb.compose()
|
|
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
|
|
i.rb.ss.first(info)
|
|
i.rb.insertUnsafe(input{bytes: d}, j, info)
|
|
i.multiSeg = d[j+int(info.size):]
|
|
return seg
|
|
}
|
|
i.rb.ss.next(info)
|
|
i.rb.insertUnsafe(input{bytes: d}, j, info)
|
|
j += int(info.size)
|
|
}
|
|
i.multiSeg = nil
|
|
i.next = nextComposed
|
|
return doNormComposed(i)
|
|
}
|
|
|
|
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
|
|
func nextDecomposed(i *Iter) (next []byte) {
|
|
outp := 0
|
|
inCopyStart, outCopyStart := i.p, 0
|
|
ss := mkStreamSafe(i.info)
|
|
for {
|
|
if sz := int(i.info.size); sz <= 1 {
|
|
p := i.p
|
|
i.p++ // ASCII or illegal byte. Either way, advance by 1.
|
|
if i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
return i.returnSlice(p, i.p)
|
|
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
|
|
i.next = i.asciiF
|
|
return i.returnSlice(p, i.p)
|
|
}
|
|
outp++
|
|
} else if d := i.info.Decomposition(); d != nil {
|
|
// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
|
|
// Case 1: there is a leftover to copy. In this case the decomposition
|
|
// must begin with a modifier and should always be appended.
|
|
// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
|
|
p := outp + len(d)
|
|
if outp > 0 {
|
|
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
|
if p > len(i.buf) {
|
|
return i.buf[:outp]
|
|
}
|
|
} else if i.info.multiSegment() {
|
|
// outp must be 0 as multi-segment decompositions always
|
|
// start a new segment.
|
|
if i.multiSeg == nil {
|
|
i.multiSeg = d
|
|
i.next = nextMulti
|
|
return nextMulti(i)
|
|
}
|
|
// We are in the last segment. Treat as normal decomposition.
|
|
d = i.multiSeg
|
|
i.multiSeg = nil
|
|
p = len(d)
|
|
}
|
|
prevCC := i.info.tccc
|
|
if i.p += sz; i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
i.info = Properties{} // Force BoundaryBefore to succeed.
|
|
} else {
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
}
|
|
switch ss.next(i.info) {
|
|
case ssOverflow:
|
|
i.next = nextCGJDecompose
|
|
fallthrough
|
|
case ssStarter:
|
|
if outp > 0 {
|
|
copy(i.buf[outp:], d)
|
|
return i.buf[:p]
|
|
}
|
|
return d
|
|
}
|
|
copy(i.buf[outp:], d)
|
|
outp = p
|
|
inCopyStart, outCopyStart = i.p, outp
|
|
if i.info.ccc < prevCC {
|
|
goto doNorm
|
|
}
|
|
continue
|
|
} else if r := i.rb.src.hangul(i.p); r != 0 {
|
|
outp = decomposeHangul(i.buf[:], r)
|
|
i.p += hangulUTF8Size
|
|
inCopyStart, outCopyStart = i.p, outp
|
|
if i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
} else if i.rb.src.hangul(i.p) != 0 {
|
|
i.next = nextHangul
|
|
return i.buf[:outp]
|
|
}
|
|
} else {
|
|
p := outp + sz
|
|
if p > len(i.buf) {
|
|
break
|
|
}
|
|
outp = p
|
|
i.p += sz
|
|
}
|
|
if i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
}
|
|
prevCC := i.info.tccc
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if v := ss.next(i.info); v == ssStarter {
|
|
break
|
|
} else if v == ssOverflow {
|
|
i.next = nextCGJDecompose
|
|
break
|
|
}
|
|
if i.info.ccc < prevCC {
|
|
goto doNorm
|
|
}
|
|
}
|
|
if outCopyStart == 0 {
|
|
return i.returnSlice(inCopyStart, i.p)
|
|
} else if inCopyStart < i.p {
|
|
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
|
}
|
|
return i.buf[:outp]
|
|
doNorm:
|
|
// Insert what we have decomposed so far in the reorderBuffer.
|
|
// As we will only reorder, there will always be enough room.
|
|
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
|
i.rb.insertDecomposed(i.buf[0:outp])
|
|
return doNormDecomposed(i)
|
|
}
|
|
|
|
func doNormDecomposed(i *Iter) []byte {
|
|
for {
|
|
if s := i.rb.ss.next(i.info); s == ssOverflow {
|
|
i.next = nextCGJDecompose
|
|
break
|
|
}
|
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
|
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if i.info.ccc == 0 {
|
|
break
|
|
}
|
|
}
|
|
// new segment or too many combining characters: exit normalization
|
|
return i.buf[:i.rb.flushCopy(i.buf[:])]
|
|
}
|
|
|
|
func nextCGJDecompose(i *Iter) []byte {
|
|
i.rb.ss = 0
|
|
i.rb.insertCGJ()
|
|
i.next = nextDecomposed
|
|
buf := doNormDecomposed(i)
|
|
return buf
|
|
}
|
|
|
|
// nextComposed is the implementation of Next for forms NFC and NFKC.
|
|
func nextComposed(i *Iter) []byte {
|
|
outp, startp := 0, i.p
|
|
var prevCC uint8
|
|
ss := mkStreamSafe(i.info)
|
|
for {
|
|
if !i.info.isYesC() {
|
|
goto doNorm
|
|
}
|
|
prevCC = i.info.tccc
|
|
sz := int(i.info.size)
|
|
if sz == 0 {
|
|
sz = 1 // illegal rune: copy byte-by-byte
|
|
}
|
|
p := outp + sz
|
|
if p > len(i.buf) {
|
|
break
|
|
}
|
|
outp = p
|
|
i.p += sz
|
|
if i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
|
|
i.next = i.asciiF
|
|
break
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if v := ss.next(i.info); v == ssStarter {
|
|
break
|
|
} else if v == ssOverflow {
|
|
i.next = nextCGJCompose
|
|
break
|
|
}
|
|
if i.info.ccc < prevCC {
|
|
goto doNorm
|
|
}
|
|
}
|
|
return i.returnSlice(startp, i.p)
|
|
doNorm:
|
|
i.p = startp
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if i.info.multiSegment() {
|
|
d := i.info.Decomposition()
|
|
info := i.rb.f.info(input{bytes: d}, 0)
|
|
i.rb.insertUnsafe(input{bytes: d}, 0, info)
|
|
i.multiSeg = d[int(info.size):]
|
|
i.next = nextMultiNorm
|
|
return nextMultiNorm(i)
|
|
}
|
|
i.rb.ss.first(i.info)
|
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
|
return doNormComposed(i)
|
|
}
|
|
|
|
func doNormComposed(i *Iter) []byte {
|
|
// First rune should already be inserted.
|
|
for {
|
|
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if s := i.rb.ss.next(i.info); s == ssStarter {
|
|
break
|
|
} else if s == ssOverflow {
|
|
i.next = nextCGJCompose
|
|
break
|
|
}
|
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
|
}
|
|
i.rb.compose()
|
|
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
|
|
return seg
|
|
}
|
|
|
|
func nextCGJCompose(i *Iter) []byte {
|
|
i.rb.ss = 0 // instead of first
|
|
i.rb.insertCGJ()
|
|
i.next = nextComposed
|
|
// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
|
|
// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
|
|
// If we ever change that, insert a check here.
|
|
i.rb.ss.first(i.info)
|
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
|
return doNormComposed(i)
|
|
}
|