mirror of https://github.com/docker/buildx.git
vendor golang.org/x/text dependency
Signed-off-by: Talon Bowler <talon.bowler@docker.com>
This commit is contained in:
parent
9fdc99dc76
commit
d342cb9d03
|
@ -0,0 +1,162 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go gen_trieval.go
|
||||
|
||||
// Package cases provides general and language-specific case mappers.
|
||||
package cases // import "golang.org/x/text/cases"
|
||||
|
||||
import (
|
||||
"golang.org/x/text/language"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// References:
|
||||
// - Unicode Reference Manual Chapter 3.13, 4.2, and 5.18.
|
||||
// - https://www.unicode.org/reports/tr29/
|
||||
// - https://www.unicode.org/Public/6.3.0/ucd/CaseFolding.txt
|
||||
// - https://www.unicode.org/Public/6.3.0/ucd/SpecialCasing.txt
|
||||
// - https://www.unicode.org/Public/6.3.0/ucd/DerivedCoreProperties.txt
|
||||
// - https://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakProperty.txt
|
||||
// - https://www.unicode.org/Public/6.3.0/ucd/auxiliary/WordBreakTest.txt
|
||||
// - http://userguide.icu-project.org/transforms/casemappings
|
||||
|
||||
// TODO:
|
||||
// - Case folding
|
||||
// - Wide and Narrow?
|
||||
// - Segmenter option for title casing.
|
||||
// - ASCII fast paths
|
||||
// - Encode Soft-Dotted property within trie somehow.
|
||||
|
||||
// A Caser transforms given input to a certain case. It implements
|
||||
// transform.Transformer.
|
||||
//
|
||||
// A Caser may be stateful and should therefore not be shared between
|
||||
// goroutines.
|
||||
type Caser struct {
|
||||
t transform.SpanningTransformer
|
||||
}
|
||||
|
||||
// Bytes returns a new byte slice with the result of converting b to the case
|
||||
// form implemented by c.
|
||||
func (c Caser) Bytes(b []byte) []byte {
|
||||
b, _, _ = transform.Bytes(c.t, b)
|
||||
return b
|
||||
}
|
||||
|
||||
// String returns a string with the result of transforming s to the case form
|
||||
// implemented by c.
|
||||
func (c Caser) String(s string) string {
|
||||
s, _, _ = transform.String(c.t, s)
|
||||
return s
|
||||
}
|
||||
|
||||
// Reset resets the Caser to be reused for new input after a previous call to
|
||||
// Transform.
|
||||
func (c Caser) Reset() { c.t.Reset() }
|
||||
|
||||
// Transform implements the transform.Transformer interface and transforms the
|
||||
// given input to the case form implemented by c.
|
||||
func (c Caser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
return c.t.Transform(dst, src, atEOF)
|
||||
}
|
||||
|
||||
// Span implements the transform.SpanningTransformer interface.
|
||||
func (c Caser) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
return c.t.Span(src, atEOF)
|
||||
}
|
||||
|
||||
// Upper returns a Caser for language-specific uppercasing.
|
||||
func Upper(t language.Tag, opts ...Option) Caser {
|
||||
return Caser{makeUpper(t, getOpts(opts...))}
|
||||
}
|
||||
|
||||
// Lower returns a Caser for language-specific lowercasing.
|
||||
func Lower(t language.Tag, opts ...Option) Caser {
|
||||
return Caser{makeLower(t, getOpts(opts...))}
|
||||
}
|
||||
|
||||
// Title returns a Caser for language-specific title casing. It uses an
|
||||
// approximation of the default Unicode Word Break algorithm.
|
||||
func Title(t language.Tag, opts ...Option) Caser {
|
||||
return Caser{makeTitle(t, getOpts(opts...))}
|
||||
}
|
||||
|
||||
// Fold returns a Caser that implements Unicode case folding. The returned Caser
|
||||
// is stateless and safe to use concurrently by multiple goroutines.
|
||||
//
|
||||
// Case folding does not normalize the input and may not preserve a normal form.
|
||||
// Use the collate or search package for more convenient and linguistically
|
||||
// sound comparisons. Use golang.org/x/text/secure/precis for string comparisons
|
||||
// where security aspects are a concern.
|
||||
func Fold(opts ...Option) Caser {
|
||||
return Caser{makeFold(getOpts(opts...))}
|
||||
}
|
||||
|
||||
// An Option is used to modify the behavior of a Caser.
|
||||
type Option func(o options) options
|
||||
|
||||
// TODO: consider these options to take a boolean as well, like FinalSigma.
|
||||
// The advantage of using this approach is that other providers of a lower-case
|
||||
// algorithm could set different defaults by prefixing a user-provided slice
|
||||
// of options with their own. This is handy, for instance, for the precis
|
||||
// package which would override the default to not handle the Greek final sigma.
|
||||
|
||||
var (
|
||||
// NoLower disables the lowercasing of non-leading letters for a title
|
||||
// caser.
|
||||
NoLower Option = noLower
|
||||
|
||||
// Compact omits mappings in case folding for characters that would grow the
|
||||
// input. (Unimplemented.)
|
||||
Compact Option = compact
|
||||
)
|
||||
|
||||
// TODO: option to preserve a normal form, if applicable?
|
||||
|
||||
type options struct {
|
||||
noLower bool
|
||||
simple bool
|
||||
|
||||
// TODO: segmenter, max ignorable, alternative versions, etc.
|
||||
|
||||
ignoreFinalSigma bool
|
||||
}
|
||||
|
||||
func getOpts(o ...Option) (res options) {
|
||||
for _, f := range o {
|
||||
res = f(res)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func noLower(o options) options {
|
||||
o.noLower = true
|
||||
return o
|
||||
}
|
||||
|
||||
func compact(o options) options {
|
||||
o.simple = true
|
||||
return o
|
||||
}
|
||||
|
||||
// HandleFinalSigma specifies whether the special handling of Greek final sigma
|
||||
// should be enabled. Unicode prescribes handling the Greek final sigma for all
|
||||
// locales, but standards like IDNA and PRECIS override this default.
|
||||
func HandleFinalSigma(enable bool) Option {
|
||||
if enable {
|
||||
return handleFinalSigma
|
||||
}
|
||||
return ignoreFinalSigma
|
||||
}
|
||||
|
||||
func ignoreFinalSigma(o options) options {
|
||||
o.ignoreFinalSigma = true
|
||||
return o
|
||||
}
|
||||
|
||||
func handleFinalSigma(o options) options {
|
||||
o.ignoreFinalSigma = false
|
||||
return o
|
||||
}
|
|
@ -0,0 +1,376 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package cases
|
||||
|
||||
import "golang.org/x/text/transform"
|
||||
|
||||
// A context is used for iterating over source bytes, fetching case info and
|
||||
// writing to a destination buffer.
|
||||
//
|
||||
// Casing operations may need more than one rune of context to decide how a rune
|
||||
// should be cased. Casing implementations should call checkpoint on context
|
||||
// whenever it is known to be safe to return the runes processed so far.
|
||||
//
|
||||
// It is recommended for implementations to not allow for more than 30 case
|
||||
// ignorables as lookahead (analogous to the limit in norm) and to use state if
|
||||
// unbounded lookahead is needed for cased runes.
|
||||
type context struct {
|
||||
dst, src []byte
|
||||
atEOF bool
|
||||
|
||||
pDst int // pDst points past the last written rune in dst.
|
||||
pSrc int // pSrc points to the start of the currently scanned rune.
|
||||
|
||||
// checkpoints safe to return in Transform, where nDst <= pDst and nSrc <= pSrc.
|
||||
nDst, nSrc int
|
||||
err error
|
||||
|
||||
sz int // size of current rune
|
||||
info info // case information of currently scanned rune
|
||||
|
||||
// State preserved across calls to Transform.
|
||||
isMidWord bool // false if next cased letter needs to be title-cased.
|
||||
}
|
||||
|
||||
func (c *context) Reset() {
|
||||
c.isMidWord = false
|
||||
}
|
||||
|
||||
// ret returns the return values for the Transform method. It checks whether
|
||||
// there were insufficient bytes in src to complete and introduces an error
|
||||
// accordingly, if necessary.
|
||||
func (c *context) ret() (nDst, nSrc int, err error) {
|
||||
if c.err != nil || c.nSrc == len(c.src) {
|
||||
return c.nDst, c.nSrc, c.err
|
||||
}
|
||||
// This point is only reached by mappers if there was no short destination
|
||||
// buffer. This means that the source buffer was exhausted and that c.sz was
|
||||
// set to 0 by next.
|
||||
if c.atEOF && c.pSrc == len(c.src) {
|
||||
return c.pDst, c.pSrc, nil
|
||||
}
|
||||
return c.nDst, c.nSrc, transform.ErrShortSrc
|
||||
}
|
||||
|
||||
// retSpan returns the return values for the Span method. It checks whether
|
||||
// there were insufficient bytes in src to complete and introduces an error
|
||||
// accordingly, if necessary.
|
||||
func (c *context) retSpan() (n int, err error) {
|
||||
_, nSrc, err := c.ret()
|
||||
return nSrc, err
|
||||
}
|
||||
|
||||
// checkpoint sets the return value buffer points for Transform to the current
|
||||
// positions.
|
||||
func (c *context) checkpoint() {
|
||||
if c.err == nil {
|
||||
c.nDst, c.nSrc = c.pDst, c.pSrc+c.sz
|
||||
}
|
||||
}
|
||||
|
||||
// unreadRune causes the last rune read by next to be reread on the next
|
||||
// invocation of next. Only one unreadRune may be called after a call to next.
|
||||
func (c *context) unreadRune() {
|
||||
c.sz = 0
|
||||
}
|
||||
|
||||
func (c *context) next() bool {
|
||||
c.pSrc += c.sz
|
||||
if c.pSrc == len(c.src) || c.err != nil {
|
||||
c.info, c.sz = 0, 0
|
||||
return false
|
||||
}
|
||||
v, sz := trie.lookup(c.src[c.pSrc:])
|
||||
c.info, c.sz = info(v), sz
|
||||
if c.sz == 0 {
|
||||
if c.atEOF {
|
||||
// A zero size means we have an incomplete rune. If we are atEOF,
|
||||
// this means it is an illegal rune, which we will consume one
|
||||
// byte at a time.
|
||||
c.sz = 1
|
||||
} else {
|
||||
c.err = transform.ErrShortSrc
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// writeBytes adds bytes to dst.
|
||||
func (c *context) writeBytes(b []byte) bool {
|
||||
if len(c.dst)-c.pDst < len(b) {
|
||||
c.err = transform.ErrShortDst
|
||||
return false
|
||||
}
|
||||
// This loop is faster than using copy.
|
||||
for _, ch := range b {
|
||||
c.dst[c.pDst] = ch
|
||||
c.pDst++
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// writeString writes the given string to dst.
|
||||
func (c *context) writeString(s string) bool {
|
||||
if len(c.dst)-c.pDst < len(s) {
|
||||
c.err = transform.ErrShortDst
|
||||
return false
|
||||
}
|
||||
// This loop is faster than using copy.
|
||||
for i := 0; i < len(s); i++ {
|
||||
c.dst[c.pDst] = s[i]
|
||||
c.pDst++
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// copy writes the current rune to dst.
|
||||
func (c *context) copy() bool {
|
||||
return c.writeBytes(c.src[c.pSrc : c.pSrc+c.sz])
|
||||
}
|
||||
|
||||
// copyXOR copies the current rune to dst and modifies it by applying the XOR
|
||||
// pattern of the case info. It is the responsibility of the caller to ensure
|
||||
// that this is a rune with a XOR pattern defined.
|
||||
func (c *context) copyXOR() bool {
|
||||
if !c.copy() {
|
||||
return false
|
||||
}
|
||||
if c.info&xorIndexBit == 0 {
|
||||
// Fast path for 6-bit XOR pattern, which covers most cases.
|
||||
c.dst[c.pDst-1] ^= byte(c.info >> xorShift)
|
||||
} else {
|
||||
// Interpret XOR bits as an index.
|
||||
// TODO: test performance for unrolling this loop. Verify that we have
|
||||
// at least two bytes and at most three.
|
||||
idx := c.info >> xorShift
|
||||
for p := c.pDst - 1; ; p-- {
|
||||
c.dst[p] ^= xorData[idx]
|
||||
idx--
|
||||
if xorData[idx] == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// hasPrefix returns true if src[pSrc:] starts with the given string.
|
||||
func (c *context) hasPrefix(s string) bool {
|
||||
b := c.src[c.pSrc:]
|
||||
if len(b) < len(s) {
|
||||
return false
|
||||
}
|
||||
for i, c := range b[:len(s)] {
|
||||
if c != s[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// caseType returns an info with only the case bits, normalized to either
|
||||
// cLower, cUpper, cTitle or cUncased.
|
||||
func (c *context) caseType() info {
|
||||
cm := c.info & 0x7
|
||||
if cm < 4 {
|
||||
return cm
|
||||
}
|
||||
if cm >= cXORCase {
|
||||
// xor the last bit of the rune with the case type bits.
|
||||
b := c.src[c.pSrc+c.sz-1]
|
||||
return info(b&1) ^ cm&0x3
|
||||
}
|
||||
if cm == cIgnorableCased {
|
||||
return cLower
|
||||
}
|
||||
return cUncased
|
||||
}
|
||||
|
||||
// lower writes the lowercase version of the current rune to dst.
|
||||
func lower(c *context) bool {
|
||||
ct := c.caseType()
|
||||
if c.info&hasMappingMask == 0 || ct == cLower {
|
||||
return c.copy()
|
||||
}
|
||||
if c.info&exceptionBit == 0 {
|
||||
return c.copyXOR()
|
||||
}
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
offset := 2 + e[0]&lengthMask // size of header + fold string
|
||||
if nLower := (e[1] >> lengthBits) & lengthMask; nLower != noChange {
|
||||
return c.writeString(e[offset : offset+nLower])
|
||||
}
|
||||
return c.copy()
|
||||
}
|
||||
|
||||
func isLower(c *context) bool {
|
||||
ct := c.caseType()
|
||||
if c.info&hasMappingMask == 0 || ct == cLower {
|
||||
return true
|
||||
}
|
||||
if c.info&exceptionBit == 0 {
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
if nLower := (e[1] >> lengthBits) & lengthMask; nLower != noChange {
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// upper writes the uppercase version of the current rune to dst.
|
||||
func upper(c *context) bool {
|
||||
ct := c.caseType()
|
||||
if c.info&hasMappingMask == 0 || ct == cUpper {
|
||||
return c.copy()
|
||||
}
|
||||
if c.info&exceptionBit == 0 {
|
||||
return c.copyXOR()
|
||||
}
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
offset := 2 + e[0]&lengthMask // size of header + fold string
|
||||
// Get length of first special case mapping.
|
||||
n := (e[1] >> lengthBits) & lengthMask
|
||||
if ct == cTitle {
|
||||
// The first special case mapping is for lower. Set n to the second.
|
||||
if n == noChange {
|
||||
n = 0
|
||||
}
|
||||
n, e = e[1]&lengthMask, e[n:]
|
||||
}
|
||||
if n != noChange {
|
||||
return c.writeString(e[offset : offset+n])
|
||||
}
|
||||
return c.copy()
|
||||
}
|
||||
|
||||
// isUpper writes the isUppercase version of the current rune to dst.
|
||||
func isUpper(c *context) bool {
|
||||
ct := c.caseType()
|
||||
if c.info&hasMappingMask == 0 || ct == cUpper {
|
||||
return true
|
||||
}
|
||||
if c.info&exceptionBit == 0 {
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
// Get length of first special case mapping.
|
||||
n := (e[1] >> lengthBits) & lengthMask
|
||||
if ct == cTitle {
|
||||
n = e[1] & lengthMask
|
||||
}
|
||||
if n != noChange {
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// title writes the title case version of the current rune to dst.
|
||||
func title(c *context) bool {
|
||||
ct := c.caseType()
|
||||
if c.info&hasMappingMask == 0 || ct == cTitle {
|
||||
return c.copy()
|
||||
}
|
||||
if c.info&exceptionBit == 0 {
|
||||
if ct == cLower {
|
||||
return c.copyXOR()
|
||||
}
|
||||
return c.copy()
|
||||
}
|
||||
// Get the exception data.
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
offset := 2 + e[0]&lengthMask // size of header + fold string
|
||||
|
||||
nFirst := (e[1] >> lengthBits) & lengthMask
|
||||
if nTitle := e[1] & lengthMask; nTitle != noChange {
|
||||
if nFirst != noChange {
|
||||
e = e[nFirst:]
|
||||
}
|
||||
return c.writeString(e[offset : offset+nTitle])
|
||||
}
|
||||
if ct == cLower && nFirst != noChange {
|
||||
// Use the uppercase version instead.
|
||||
return c.writeString(e[offset : offset+nFirst])
|
||||
}
|
||||
// Already in correct case.
|
||||
return c.copy()
|
||||
}
|
||||
|
||||
// isTitle reports whether the current rune is in title case.
|
||||
func isTitle(c *context) bool {
|
||||
ct := c.caseType()
|
||||
if c.info&hasMappingMask == 0 || ct == cTitle {
|
||||
return true
|
||||
}
|
||||
if c.info&exceptionBit == 0 {
|
||||
if ct == cLower {
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
// Get the exception data.
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
if nTitle := e[1] & lengthMask; nTitle != noChange {
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
||||
nFirst := (e[1] >> lengthBits) & lengthMask
|
||||
if ct == cLower && nFirst != noChange {
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// foldFull writes the foldFull version of the current rune to dst.
|
||||
func foldFull(c *context) bool {
|
||||
if c.info&hasMappingMask == 0 {
|
||||
return c.copy()
|
||||
}
|
||||
ct := c.caseType()
|
||||
if c.info&exceptionBit == 0 {
|
||||
if ct != cLower || c.info&inverseFoldBit != 0 {
|
||||
return c.copyXOR()
|
||||
}
|
||||
return c.copy()
|
||||
}
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
n := e[0] & lengthMask
|
||||
if n == 0 {
|
||||
if ct == cLower {
|
||||
return c.copy()
|
||||
}
|
||||
n = (e[1] >> lengthBits) & lengthMask
|
||||
}
|
||||
return c.writeString(e[2 : 2+n])
|
||||
}
|
||||
|
||||
// isFoldFull reports whether the current run is mapped to foldFull
|
||||
func isFoldFull(c *context) bool {
|
||||
if c.info&hasMappingMask == 0 {
|
||||
return true
|
||||
}
|
||||
ct := c.caseType()
|
||||
if c.info&exceptionBit == 0 {
|
||||
if ct != cLower || c.info&inverseFoldBit != 0 {
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
e := exceptions[c.info>>exceptionShift:]
|
||||
n := e[0] & lengthMask
|
||||
if n == 0 && ct == cLower {
|
||||
return true
|
||||
}
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package cases
|
||||
|
||||
import "golang.org/x/text/transform"
|
||||
|
||||
type caseFolder struct{ transform.NopResetter }
|
||||
|
||||
// caseFolder implements the Transformer interface for doing case folding.
|
||||
func (t *caseFolder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
c := context{dst: dst, src: src, atEOF: atEOF}
|
||||
for c.next() {
|
||||
foldFull(&c)
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
func (t *caseFolder) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
c := context{src: src, atEOF: atEOF}
|
||||
for c.next() && isFoldFull(&c) {
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.retSpan()
|
||||
}
|
||||
|
||||
func makeFold(o options) transform.SpanningTransformer {
|
||||
// TODO: Special case folding, through option Language, Special/Turkic, or
|
||||
// both.
|
||||
// TODO: Implement Compact options.
|
||||
return &caseFolder{}
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build icu
|
||||
|
||||
package cases
|
||||
|
||||
// Ideally these functions would be defined in a test file, but go test doesn't
|
||||
// allow CGO in tests. The build tag should ensure either way that these
|
||||
// functions will not end up in the package.
|
||||
|
||||
// TODO: Ensure that the correct ICU version is set.
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -licui18n.57 -licuuc.57
|
||||
#include <stdlib.h>
|
||||
#include <unicode/ustring.h>
|
||||
#include <unicode/utypes.h>
|
||||
#include <unicode/localpointer.h>
|
||||
#include <unicode/ucasemap.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import "unsafe"
|
||||
|
||||
func doICU(tag, caser, input string) string {
|
||||
err := C.UErrorCode(0)
|
||||
loc := C.CString(tag)
|
||||
cm := C.ucasemap_open(loc, C.uint32_t(0), &err)
|
||||
|
||||
buf := make([]byte, len(input)*4)
|
||||
dst := (*C.char)(unsafe.Pointer(&buf[0]))
|
||||
src := C.CString(input)
|
||||
|
||||
cn := C.int32_t(0)
|
||||
|
||||
switch caser {
|
||||
case "fold":
|
||||
cn = C.ucasemap_utf8FoldCase(cm,
|
||||
dst, C.int32_t(len(buf)),
|
||||
src, C.int32_t(len(input)),
|
||||
&err)
|
||||
case "lower":
|
||||
cn = C.ucasemap_utf8ToLower(cm,
|
||||
dst, C.int32_t(len(buf)),
|
||||
src, C.int32_t(len(input)),
|
||||
&err)
|
||||
case "upper":
|
||||
cn = C.ucasemap_utf8ToUpper(cm,
|
||||
dst, C.int32_t(len(buf)),
|
||||
src, C.int32_t(len(input)),
|
||||
&err)
|
||||
case "title":
|
||||
cn = C.ucasemap_utf8ToTitle(cm,
|
||||
dst, C.int32_t(len(buf)),
|
||||
src, C.int32_t(len(input)),
|
||||
&err)
|
||||
}
|
||||
return string(buf[:cn])
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package cases
|
||||
|
||||
func (c info) cccVal() info {
|
||||
if c&exceptionBit != 0 {
|
||||
return info(exceptions[c>>exceptionShift]) & cccMask
|
||||
}
|
||||
return c & cccMask
|
||||
}
|
||||
|
||||
func (c info) cccType() info {
|
||||
ccc := c.cccVal()
|
||||
if ccc <= cccZero {
|
||||
return cccZero
|
||||
}
|
||||
return ccc
|
||||
}
|
||||
|
||||
// TODO: Implement full Unicode breaking algorithm:
|
||||
// 1) Implement breaking in separate package.
|
||||
// 2) Use the breaker here.
|
||||
// 3) Compare table size and performance of using the more generic breaker.
|
||||
//
|
||||
// Note that we can extend the current algorithm to be much more accurate. This
|
||||
// only makes sense, though, if the performance and/or space penalty of using
|
||||
// the generic breaker is big. Extra data will only be needed for non-cased
|
||||
// runes, which means there are sufficient bits left in the caseType.
|
||||
// ICU prohibits breaking in such cases as well.
|
||||
|
||||
// For the purpose of title casing we use an approximation of the Unicode Word
|
||||
// Breaking algorithm defined in Annex #29:
|
||||
// https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table.
|
||||
//
|
||||
// For our approximation, we group the Word Break types into the following
|
||||
// categories, with associated rules:
|
||||
//
|
||||
// 1) Letter:
|
||||
// ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend, Format_FE, ZWJ.
|
||||
// Rule: Never break between consecutive runes of this category.
|
||||
//
|
||||
// 2) Mid:
|
||||
// MidLetter, MidNumLet, Single_Quote.
|
||||
// (Cf. case-ignorable: MidLetter, MidNumLet, Single_Quote or cat is Mn,
|
||||
// Me, Cf, Lm or Sk).
|
||||
// Rule: Don't break between Letter and Mid, but break between two Mids.
|
||||
//
|
||||
// 3) Break:
|
||||
// Any other category: NewLine, MidNum, CR, LF, Double_Quote, Katakana, and
|
||||
// Other.
|
||||
// These categories should always result in a break between two cased letters.
|
||||
// Rule: Always break.
|
||||
//
|
||||
// Note 1: the Katakana and MidNum categories can, in esoteric cases, result in
|
||||
// preventing a break between two cased letters. For now we will ignore this
|
||||
// (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and
|
||||
// [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].)
|
||||
//
|
||||
// Note 2: the rule for Mid is very approximate, but works in most cases. To
|
||||
// improve, we could store the categories in the trie value and use a FA to
|
||||
// manage breaks. See TODO comment above.
|
||||
//
|
||||
// Note 3: according to the spec, it is possible for the Extend category to
|
||||
// introduce breaks between other categories grouped in Letter. However, this
|
||||
// is undesirable for our purposes. ICU prevents breaks in such cases as well.
|
||||
|
||||
// isBreak returns whether this rune should introduce a break.
|
||||
func (c info) isBreak() bool {
|
||||
return c.cccVal() == cccBreak
|
||||
}
|
||||
|
||||
// isLetter returns whether the rune is of break type ALetter, Hebrew_Letter,
|
||||
// Numeric, ExtendNumLet, or Extend.
|
||||
func (c info) isLetter() bool {
|
||||
ccc := c.cccVal()
|
||||
if ccc == cccZero {
|
||||
return !c.isCaseIgnorable()
|
||||
}
|
||||
return ccc != cccBreak
|
||||
}
|
|
@ -0,0 +1,816 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package cases
|
||||
|
||||
// This file contains the definitions of case mappings for all supported
|
||||
// languages. The rules for the language-specific tailorings were taken and
|
||||
// modified from the CLDR transform definitions in common/transforms.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/internal"
|
||||
"golang.org/x/text/language"
|
||||
"golang.org/x/text/transform"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// A mapFunc takes a context set to the current rune and writes the mapped
|
||||
// version to the same context. It may advance the context to the next rune. It
|
||||
// returns whether a checkpoint is possible: whether the pDst bytes written to
|
||||
// dst so far won't need changing as we see more source bytes.
|
||||
type mapFunc func(*context) bool
|
||||
|
||||
// A spanFunc takes a context set to the current rune and returns whether this
|
||||
// rune would be altered when written to the output. It may advance the context
|
||||
// to the next rune. It returns whether a checkpoint is possible.
|
||||
type spanFunc func(*context) bool
|
||||
|
||||
// maxIgnorable defines the maximum number of ignorables to consider for
|
||||
// lookahead operations.
|
||||
const maxIgnorable = 30
|
||||
|
||||
// supported lists the language tags for which we have tailorings.
|
||||
const supported = "und af az el lt nl tr"
|
||||
|
||||
func init() {
|
||||
tags := []language.Tag{}
|
||||
for _, s := range strings.Split(supported, " ") {
|
||||
tags = append(tags, language.MustParse(s))
|
||||
}
|
||||
matcher = internal.NewInheritanceMatcher(tags)
|
||||
Supported = language.NewCoverage(tags)
|
||||
}
|
||||
|
||||
var (
|
||||
matcher *internal.InheritanceMatcher
|
||||
|
||||
Supported language.Coverage
|
||||
|
||||
// We keep the following lists separate, instead of having a single per-
|
||||
// language struct, to give the compiler a chance to remove unused code.
|
||||
|
||||
// Some uppercase mappers are stateless, so we can precompute the
|
||||
// Transformers and save a bit on runtime allocations.
|
||||
upperFunc = []struct {
|
||||
upper mapFunc
|
||||
span spanFunc
|
||||
}{
|
||||
{nil, nil}, // und
|
||||
{nil, nil}, // af
|
||||
{aztrUpper(upper), isUpper}, // az
|
||||
{elUpper, noSpan}, // el
|
||||
{ltUpper(upper), noSpan}, // lt
|
||||
{nil, nil}, // nl
|
||||
{aztrUpper(upper), isUpper}, // tr
|
||||
}
|
||||
|
||||
undUpper transform.SpanningTransformer = &undUpperCaser{}
|
||||
undLower transform.SpanningTransformer = &undLowerCaser{}
|
||||
undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{}
|
||||
|
||||
lowerFunc = []mapFunc{
|
||||
nil, // und
|
||||
nil, // af
|
||||
aztrLower, // az
|
||||
nil, // el
|
||||
ltLower, // lt
|
||||
nil, // nl
|
||||
aztrLower, // tr
|
||||
}
|
||||
|
||||
titleInfos = []struct {
|
||||
title mapFunc
|
||||
lower mapFunc
|
||||
titleSpan spanFunc
|
||||
rewrite func(*context)
|
||||
}{
|
||||
{title, lower, isTitle, nil}, // und
|
||||
{title, lower, isTitle, afnlRewrite}, // af
|
||||
{aztrUpper(title), aztrLower, isTitle, nil}, // az
|
||||
{title, lower, isTitle, nil}, // el
|
||||
{ltUpper(title), ltLower, noSpan, nil}, // lt
|
||||
{nlTitle, lower, nlTitleSpan, afnlRewrite}, // nl
|
||||
{aztrUpper(title), aztrLower, isTitle, nil}, // tr
|
||||
}
|
||||
)
|
||||
|
||||
func makeUpper(t language.Tag, o options) transform.SpanningTransformer {
|
||||
_, i, _ := matcher.Match(t)
|
||||
f := upperFunc[i].upper
|
||||
if f == nil {
|
||||
return undUpper
|
||||
}
|
||||
return &simpleCaser{f: f, span: upperFunc[i].span}
|
||||
}
|
||||
|
||||
func makeLower(t language.Tag, o options) transform.SpanningTransformer {
|
||||
_, i, _ := matcher.Match(t)
|
||||
f := lowerFunc[i]
|
||||
if f == nil {
|
||||
if o.ignoreFinalSigma {
|
||||
return undLowerIgnoreSigma
|
||||
}
|
||||
return undLower
|
||||
}
|
||||
if o.ignoreFinalSigma {
|
||||
return &simpleCaser{f: f, span: isLower}
|
||||
}
|
||||
return &lowerCaser{
|
||||
first: f,
|
||||
midWord: finalSigma(f),
|
||||
}
|
||||
}
|
||||
|
||||
func makeTitle(t language.Tag, o options) transform.SpanningTransformer {
|
||||
_, i, _ := matcher.Match(t)
|
||||
x := &titleInfos[i]
|
||||
lower := x.lower
|
||||
if o.noLower {
|
||||
lower = (*context).copy
|
||||
} else if !o.ignoreFinalSigma {
|
||||
lower = finalSigma(lower)
|
||||
}
|
||||
return &titleCaser{
|
||||
title: x.title,
|
||||
lower: lower,
|
||||
titleSpan: x.titleSpan,
|
||||
rewrite: x.rewrite,
|
||||
}
|
||||
}
|
||||
|
||||
func noSpan(c *context) bool {
|
||||
c.err = transform.ErrEndOfSpan
|
||||
return false
|
||||
}
|
||||
|
||||
// TODO: consider a similar special case for the fast majority lower case. This
|
||||
// is a bit more involved so will require some more precise benchmarking to
|
||||
// justify it.
|
||||
|
||||
type undUpperCaser struct{ transform.NopResetter }
|
||||
|
||||
// undUpperCaser implements the Transformer interface for doing an upper case
|
||||
// mapping for the root locale (und). It eliminates the need for an allocation
|
||||
// as it prevents escaping by not using function pointers.
|
||||
func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
c := context{dst: dst, src: src, atEOF: atEOF}
|
||||
for c.next() {
|
||||
upper(&c)
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
c := context{src: src, atEOF: atEOF}
|
||||
for c.next() && isUpper(&c) {
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.retSpan()
|
||||
}
|
||||
|
||||
// undLowerIgnoreSigmaCaser implements the Transformer interface for doing
|
||||
// a lower case mapping for the root locale (und) ignoring final sigma
|
||||
// handling. This casing algorithm is used in some performance-critical packages
|
||||
// like secure/precis and x/net/http/idna, which warrants its special-casing.
|
||||
type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }
|
||||
|
||||
func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
c := context{dst: dst, src: src, atEOF: atEOF}
|
||||
for c.next() && lower(&c) {
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
|
||||
}
|
||||
|
||||
// Span implements a generic lower-casing. This is possible as isLower works
|
||||
// for all lowercasing variants. All lowercase variants only vary in how they
|
||||
// transform a non-lowercase letter. They will never change an already lowercase
|
||||
// letter. In addition, there is no state.
|
||||
func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
c := context{src: src, atEOF: atEOF}
|
||||
for c.next() && isLower(&c) {
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.retSpan()
|
||||
}
|
||||
|
||||
type simpleCaser struct {
|
||||
context
|
||||
f mapFunc
|
||||
span spanFunc
|
||||
}
|
||||
|
||||
// simpleCaser implements the Transformer interface for doing a case operation
|
||||
// on a rune-by-rune basis.
|
||||
func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
c := context{dst: dst, src: src, atEOF: atEOF}
|
||||
for c.next() && t.f(&c) {
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
c := context{src: src, atEOF: atEOF}
|
||||
for c.next() && t.span(&c) {
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.retSpan()
|
||||
}
|
||||
|
||||
// undLowerCaser implements the Transformer interface for doing a lower case
|
||||
// mapping for the root locale (und) ignoring final sigma handling. This casing
|
||||
// algorithm is used in some performance-critical packages like secure/precis
|
||||
// and x/net/http/idna, which warrants its special-casing.
|
||||
type undLowerCaser struct{ transform.NopResetter }
|
||||
|
||||
func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
c := context{dst: dst, src: src, atEOF: atEOF}
|
||||
|
||||
for isInterWord := true; c.next(); {
|
||||
if isInterWord {
|
||||
if c.info.isCased() {
|
||||
if !lower(&c) {
|
||||
break
|
||||
}
|
||||
isInterWord = false
|
||||
} else if !c.copy() {
|
||||
break
|
||||
}
|
||||
} else {
|
||||
if c.info.isNotCasedAndNotCaseIgnorable() {
|
||||
if !c.copy() {
|
||||
break
|
||||
}
|
||||
isInterWord = true
|
||||
} else if !c.hasPrefix("Σ") {
|
||||
if !lower(&c) {
|
||||
break
|
||||
}
|
||||
} else if !finalSigmaBody(&c) {
|
||||
break
|
||||
}
|
||||
}
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
c := context{src: src, atEOF: atEOF}
|
||||
for c.next() && isLower(&c) {
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.retSpan()
|
||||
}
|
||||
|
||||
// lowerCaser implements the Transformer interface. The default Unicode lower
|
||||
// casing requires different treatment for the first and subsequent characters
|
||||
// of a word, most notably to handle the Greek final Sigma.
|
||||
type lowerCaser struct {
|
||||
undLowerIgnoreSigmaCaser
|
||||
|
||||
context
|
||||
|
||||
first, midWord mapFunc
|
||||
}
|
||||
|
||||
func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
t.context = context{dst: dst, src: src, atEOF: atEOF}
|
||||
c := &t.context
|
||||
|
||||
for isInterWord := true; c.next(); {
|
||||
if isInterWord {
|
||||
if c.info.isCased() {
|
||||
if !t.first(c) {
|
||||
break
|
||||
}
|
||||
isInterWord = false
|
||||
} else if !c.copy() {
|
||||
break
|
||||
}
|
||||
} else {
|
||||
if c.info.isNotCasedAndNotCaseIgnorable() {
|
||||
if !c.copy() {
|
||||
break
|
||||
}
|
||||
isInterWord = true
|
||||
} else if !t.midWord(c) {
|
||||
break
|
||||
}
|
||||
}
|
||||
c.checkpoint()
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
// titleCaser implements the Transformer interface. Title casing algorithms
|
||||
// distinguish between the first letter of a word and subsequent letters of the
|
||||
// same word. It uses state to avoid requiring a potentially infinite lookahead.
|
||||
type titleCaser struct {
|
||||
context
|
||||
|
||||
// rune mappings used by the actual casing algorithms.
|
||||
title mapFunc
|
||||
lower mapFunc
|
||||
titleSpan spanFunc
|
||||
|
||||
rewrite func(*context)
|
||||
}
|
||||
|
||||
// Transform implements the standard Unicode title case algorithm as defined in
|
||||
// Chapter 3 of The Unicode Standard:
|
||||
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard
|
||||
// Annex #29, "Unicode Text Segmentation." For each word boundary, find the
|
||||
// first cased character F following the word boundary. If F exists, map F to
|
||||
// Titlecase_Mapping(F); then map all characters C between F and the following
|
||||
// word boundary to Lowercase_Mapping(C).
|
||||
func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
|
||||
c := &t.context
|
||||
|
||||
if !c.next() {
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
for {
|
||||
p := c.info
|
||||
if t.rewrite != nil {
|
||||
t.rewrite(c)
|
||||
}
|
||||
|
||||
wasMid := p.isMid()
|
||||
// Break out of this loop on failure to ensure we do not modify the
|
||||
// state incorrectly.
|
||||
if p.isCased() {
|
||||
if !c.isMidWord {
|
||||
if !t.title(c) {
|
||||
break
|
||||
}
|
||||
c.isMidWord = true
|
||||
} else if !t.lower(c) {
|
||||
break
|
||||
}
|
||||
} else if !c.copy() {
|
||||
break
|
||||
} else if p.isBreak() {
|
||||
c.isMidWord = false
|
||||
}
|
||||
|
||||
// As we save the state of the transformer, it is safe to call
|
||||
// checkpoint after any successful write.
|
||||
if !(c.isMidWord && wasMid) {
|
||||
c.checkpoint()
|
||||
}
|
||||
|
||||
if !c.next() {
|
||||
break
|
||||
}
|
||||
if wasMid && c.info.isMid() {
|
||||
c.isMidWord = false
|
||||
}
|
||||
}
|
||||
return c.ret()
|
||||
}
|
||||
|
||||
func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord}
|
||||
c := &t.context
|
||||
|
||||
if !c.next() {
|
||||
return c.retSpan()
|
||||
}
|
||||
|
||||
for {
|
||||
p := c.info
|
||||
if t.rewrite != nil {
|
||||
t.rewrite(c)
|
||||
}
|
||||
|
||||
wasMid := p.isMid()
|
||||
// Break out of this loop on failure to ensure we do not modify the
|
||||
// state incorrectly.
|
||||
if p.isCased() {
|
||||
if !c.isMidWord {
|
||||
if !t.titleSpan(c) {
|
||||
break
|
||||
}
|
||||
c.isMidWord = true
|
||||
} else if !isLower(c) {
|
||||
break
|
||||
}
|
||||
} else if p.isBreak() {
|
||||
c.isMidWord = false
|
||||
}
|
||||
// As we save the state of the transformer, it is safe to call
|
||||
// checkpoint after any successful write.
|
||||
if !(c.isMidWord && wasMid) {
|
||||
c.checkpoint()
|
||||
}
|
||||
|
||||
if !c.next() {
|
||||
break
|
||||
}
|
||||
if wasMid && c.info.isMid() {
|
||||
c.isMidWord = false
|
||||
}
|
||||
}
|
||||
return c.retSpan()
|
||||
}
|
||||
|
||||
// finalSigma adds Greek final Sigma handing to another casing function. It
|
||||
// determines whether a lowercased sigma should be σ or ς, by looking ahead for
|
||||
// case-ignorables and a cased letters.
|
||||
func finalSigma(f mapFunc) mapFunc {
|
||||
return func(c *context) bool {
|
||||
if !c.hasPrefix("Σ") {
|
||||
return f(c)
|
||||
}
|
||||
return finalSigmaBody(c)
|
||||
}
|
||||
}
|
||||
|
||||
func finalSigmaBody(c *context) bool {
|
||||
// Current rune must be ∑.
|
||||
|
||||
// ::NFD();
|
||||
// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
||||
// Σ } [:case-ignorable:]* [:cased:] → σ;
|
||||
// [:cased:] [:case-ignorable:]* { Σ → ς;
|
||||
// ::Any-Lower;
|
||||
// ::NFC();
|
||||
|
||||
p := c.pDst
|
||||
c.writeString("ς")
|
||||
|
||||
// TODO: we should do this here, but right now this will never have an
|
||||
// effect as this is called when the prefix is Sigma, whereas Dutch and
|
||||
// Afrikaans only test for an apostrophe.
|
||||
//
|
||||
// if t.rewrite != nil {
|
||||
// t.rewrite(c)
|
||||
// }
|
||||
|
||||
// We need to do one more iteration after maxIgnorable, as a cased
|
||||
// letter is not an ignorable and may modify the result.
|
||||
wasMid := false
|
||||
for i := 0; i < maxIgnorable+1; i++ {
|
||||
if !c.next() {
|
||||
return false
|
||||
}
|
||||
if !c.info.isCaseIgnorable() {
|
||||
// All Midword runes are also case ignorable, so we are
|
||||
// guaranteed to have a letter or word break here. As we are
|
||||
// unreading the run, there is no need to unset c.isMidWord;
|
||||
// the title caser will handle this.
|
||||
if c.info.isCased() {
|
||||
// p+1 is guaranteed to be in bounds: if writing ς was
|
||||
// successful, p+1 will contain the second byte of ς. If not,
|
||||
// this function will have returned after c.next returned false.
|
||||
c.dst[p+1]++ // ς → σ
|
||||
}
|
||||
c.unreadRune()
|
||||
return true
|
||||
}
|
||||
// A case ignorable may also introduce a word break, so we may need
|
||||
// to continue searching even after detecting a break.
|
||||
isMid := c.info.isMid()
|
||||
if (wasMid && isMid) || c.info.isBreak() {
|
||||
c.isMidWord = false
|
||||
}
|
||||
wasMid = isMid
|
||||
c.copy()
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// finalSigmaSpan would be the same as isLower.
|
||||
|
||||
// elUpper implements Greek upper casing, which entails removing a predefined
|
||||
// set of non-blocked modifiers. Note that these accents should not be removed
|
||||
// for title casing!
|
||||
// Example: "Οδός" -> "ΟΔΟΣ".
|
||||
func elUpper(c *context) bool {
|
||||
// From CLDR:
|
||||
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
|
||||
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;
|
||||
|
||||
r, _ := utf8.DecodeRune(c.src[c.pSrc:])
|
||||
oldPDst := c.pDst
|
||||
if !upper(c) {
|
||||
return false
|
||||
}
|
||||
if !unicode.Is(unicode.Greek, r) {
|
||||
return true
|
||||
}
|
||||
i := 0
|
||||
// Take the properties of the uppercased rune that is already written to the
|
||||
// destination. This saves us the trouble of having to uppercase the
|
||||
// decomposed rune again.
|
||||
if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
|
||||
// Restore the destination position and process the decomposed rune.
|
||||
r, sz := utf8.DecodeRune(b)
|
||||
if r <= 0xFF { // See A.6.1
|
||||
return true
|
||||
}
|
||||
c.pDst = oldPDst
|
||||
// Insert the first rune and ignore the modifiers. See A.6.2.
|
||||
c.writeBytes(b[:sz])
|
||||
i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
|
||||
}
|
||||
|
||||
for ; i < maxIgnorable && c.next(); i++ {
|
||||
switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
|
||||
// Above and Iota Subscript
|
||||
case 0x0300, // U+0300 COMBINING GRAVE ACCENT
|
||||
0x0301, // U+0301 COMBINING ACUTE ACCENT
|
||||
0x0304, // U+0304 COMBINING MACRON
|
||||
0x0306, // U+0306 COMBINING BREVE
|
||||
0x0308, // U+0308 COMBINING DIAERESIS
|
||||
0x0313, // U+0313 COMBINING COMMA ABOVE
|
||||
0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
|
||||
0x0342, // U+0342 COMBINING GREEK PERISPOMENI
|
||||
0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
|
||||
// No-op. Gobble the modifier.
|
||||
|
||||
default:
|
||||
switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
|
||||
case cccZero:
|
||||
c.unreadRune()
|
||||
return true
|
||||
|
||||
// We don't need to test for IotaSubscript as the only rune that
|
||||
// qualifies (U+0345) was already excluded in the switch statement
|
||||
// above. See A.4.
|
||||
|
||||
case cccAbove:
|
||||
return c.copy()
|
||||
default:
|
||||
// Some other modifier. We're still allowed to gobble Greek
|
||||
// modifiers after this.
|
||||
c.copy()
|
||||
}
|
||||
}
|
||||
}
|
||||
return i == maxIgnorable
|
||||
}
|
||||
|
||||
// TODO: implement elUpperSpan (low-priority: complex and infrequent).
|
||||
|
||||
func ltLower(c *context) bool {
|
||||
// From CLDR:
|
||||
// # Introduce an explicit dot above when lowercasing capital I's and J's
|
||||
// # whenever there are more accents above.
|
||||
// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
|
||||
// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
|
||||
// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
|
||||
// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
|
||||
// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
|
||||
// ::NFD();
|
||||
// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
|
||||
// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
|
||||
// I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
|
||||
// I \u0300 (Ì) → i \u0307 \u0300;
|
||||
// I \u0301 (Í) → i \u0307 \u0301;
|
||||
// I \u0303 (Ĩ) → i \u0307 \u0303;
|
||||
// ::Any-Lower();
|
||||
// ::NFC();
|
||||
|
||||
i := 0
|
||||
if r := c.src[c.pSrc]; r < utf8.RuneSelf {
|
||||
lower(c)
|
||||
if r != 'I' && r != 'J' {
|
||||
return true
|
||||
}
|
||||
} else {
|
||||
p := norm.NFD.Properties(c.src[c.pSrc:])
|
||||
if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
|
||||
// UTF-8 optimization: the decomposition will only have an above
|
||||
// modifier if the last rune of the decomposition is in [U+300-U+311].
|
||||
// In all other cases, a decomposition starting with I is always
|
||||
// an I followed by modifiers that are not cased themselves. See A.2.
|
||||
if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
|
||||
if !c.writeBytes(d[:1]) {
|
||||
return false
|
||||
}
|
||||
c.dst[c.pDst-1] += 'a' - 'A' // lower
|
||||
|
||||
// Assumption: modifier never changes on lowercase. See A.1.
|
||||
// Assumption: all modifiers added have CCC = Above. See A.2.3.
|
||||
return c.writeString("\u0307") && c.writeBytes(d[1:])
|
||||
}
|
||||
// In all other cases the additional modifiers will have a CCC
|
||||
// that is less than 230 (Above). We will insert the U+0307, if
|
||||
// needed, after these modifiers so that a string in FCD form
|
||||
// will remain so. See A.2.2.
|
||||
lower(c)
|
||||
i = 1
|
||||
} else {
|
||||
return lower(c)
|
||||
}
|
||||
}
|
||||
|
||||
for ; i < maxIgnorable && c.next(); i++ {
|
||||
switch c.info.cccType() {
|
||||
case cccZero:
|
||||
c.unreadRune()
|
||||
return true
|
||||
case cccAbove:
|
||||
return c.writeString("\u0307") && c.copy() // See A.1.
|
||||
default:
|
||||
c.copy() // See A.1.
|
||||
}
|
||||
}
|
||||
return i == maxIgnorable
|
||||
}
|
||||
|
||||
// ltLowerSpan would be the same as isLower.
|
||||
|
||||
func ltUpper(f mapFunc) mapFunc {
|
||||
return func(c *context) bool {
|
||||
// Unicode:
|
||||
// 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
|
||||
//
|
||||
// From CLDR:
|
||||
// # Remove \u0307 following soft-dotteds (i, j, and the like), with possible
|
||||
// # intervening non-230 marks.
|
||||
// ::NFD();
|
||||
// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
|
||||
// ::Any-Upper();
|
||||
// ::NFC();
|
||||
|
||||
// TODO: See A.5. A soft-dotted rune never has an exception. This would
|
||||
// allow us to overload the exception bit and encode this property in
|
||||
// info. Need to measure performance impact of this.
|
||||
r, _ := utf8.DecodeRune(c.src[c.pSrc:])
|
||||
oldPDst := c.pDst
|
||||
if !f(c) {
|
||||
return false
|
||||
}
|
||||
if !unicode.Is(unicode.Soft_Dotted, r) {
|
||||
return true
|
||||
}
|
||||
|
||||
// We don't need to do an NFD normalization, as a soft-dotted rune never
|
||||
// contains U+0307. See A.3.
|
||||
|
||||
i := 0
|
||||
for ; i < maxIgnorable && c.next(); i++ {
|
||||
switch c.info.cccType() {
|
||||
case cccZero:
|
||||
c.unreadRune()
|
||||
return true
|
||||
case cccAbove:
|
||||
if c.hasPrefix("\u0307") {
|
||||
// We don't do a full NFC, but rather combine runes for
|
||||
// some of the common cases. (Returning NFC or
|
||||
// preserving normal form is neither a requirement nor
|
||||
// a possibility anyway).
|
||||
if !c.next() {
|
||||
return false
|
||||
}
|
||||
if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
|
||||
s := ""
|
||||
switch c.src[c.pSrc+1] {
|
||||
case 0x80: // U+0300 COMBINING GRAVE ACCENT
|
||||
s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
|
||||
case 0x81: // U+0301 COMBINING ACUTE ACCENT
|
||||
s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
|
||||
case 0x83: // U+0303 COMBINING TILDE
|
||||
s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
|
||||
case 0x88: // U+0308 COMBINING DIAERESIS
|
||||
s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
|
||||
default:
|
||||
}
|
||||
if s != "" {
|
||||
c.pDst = oldPDst
|
||||
return c.writeString(s)
|
||||
}
|
||||
}
|
||||
}
|
||||
return c.copy()
|
||||
default:
|
||||
c.copy()
|
||||
}
|
||||
}
|
||||
return i == maxIgnorable
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: implement ltUpperSpan (low priority: complex and infrequent).
|
||||
|
||||
func aztrUpper(f mapFunc) mapFunc {
|
||||
return func(c *context) bool {
|
||||
// i→İ;
|
||||
if c.src[c.pSrc] == 'i' {
|
||||
return c.writeString("İ")
|
||||
}
|
||||
return f(c)
|
||||
}
|
||||
}
|
||||
|
||||
func aztrLower(c *context) (done bool) {
|
||||
// From CLDR:
|
||||
// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
||||
// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
// İ→i;
|
||||
// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||||
// # This matches the behavior of the canonically equivalent I-dot_above
|
||||
// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
|
||||
// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||||
// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
|
||||
// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
|
||||
// I→ı ;
|
||||
// ::Any-Lower();
|
||||
if c.hasPrefix("\u0130") { // İ
|
||||
return c.writeString("i")
|
||||
}
|
||||
if c.src[c.pSrc] != 'I' {
|
||||
return lower(c)
|
||||
}
|
||||
|
||||
// We ignore the lower-case I for now, but insert it later when we know
|
||||
// which form we need.
|
||||
start := c.pSrc + c.sz
|
||||
|
||||
i := 0
|
||||
Loop:
|
||||
// We check for up to n ignorables before \u0307. As \u0307 is an
|
||||
// ignorable as well, n is maxIgnorable-1.
|
||||
for ; i < maxIgnorable && c.next(); i++ {
|
||||
switch c.info.cccType() {
|
||||
case cccAbove:
|
||||
if c.hasPrefix("\u0307") {
|
||||
return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
|
||||
}
|
||||
done = true
|
||||
break Loop
|
||||
case cccZero:
|
||||
c.unreadRune()
|
||||
done = true
|
||||
break Loop
|
||||
default:
|
||||
// We'll write this rune after we know which starter to use.
|
||||
}
|
||||
}
|
||||
if i == maxIgnorable {
|
||||
done = true
|
||||
}
|
||||
return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
|
||||
}
|
||||
|
||||
// aztrLowerSpan would be the same as isLower.
|
||||
|
||||
func nlTitle(c *context) bool {
|
||||
// From CLDR:
|
||||
// # Special titlecasing for Dutch initial "ij".
|
||||
// ::Any-Title();
|
||||
// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
|
||||
// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
|
||||
if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
|
||||
return title(c)
|
||||
}
|
||||
|
||||
if !c.writeString("I") || !c.next() {
|
||||
return false
|
||||
}
|
||||
if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
|
||||
return c.writeString("J")
|
||||
}
|
||||
c.unreadRune()
|
||||
return true
|
||||
}
|
||||
|
||||
func nlTitleSpan(c *context) bool {
|
||||
// From CLDR:
|
||||
// # Special titlecasing for Dutch initial "ij".
|
||||
// ::Any-Title();
|
||||
// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
|
||||
// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
|
||||
if c.src[c.pSrc] != 'I' {
|
||||
return isTitle(c)
|
||||
}
|
||||
if !c.next() || c.src[c.pSrc] == 'j' {
|
||||
return false
|
||||
}
|
||||
if c.src[c.pSrc] != 'J' {
|
||||
c.unreadRune()
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078.
|
||||
func afnlRewrite(c *context) {
|
||||
if c.hasPrefix("'") || c.hasPrefix("’") {
|
||||
c.isMidWord = true
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,217 @@
|
|||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||
|
||||
package cases
|
||||
|
||||
// This file contains definitions for interpreting the trie value of the case
|
||||
// trie generated by "go run gen*.go". It is shared by both the generator
|
||||
// program and the resultant package. Sharing is achieved by the generator
|
||||
// copying gen_trieval.go to trieval.go and changing what's above this comment.
|
||||
|
||||
// info holds case information for a single rune. It is the value returned
|
||||
// by a trie lookup. Most mapping information can be stored in a single 16-bit
|
||||
// value. If not, for example when a rune is mapped to multiple runes, the value
|
||||
// stores some basic case data and an index into an array with additional data.
|
||||
//
|
||||
// The per-rune values have the following format:
|
||||
//
|
||||
// if (exception) {
|
||||
// 15..4 unsigned exception index
|
||||
// } else {
|
||||
// 15..8 XOR pattern or index to XOR pattern for case mapping
|
||||
// Only 13..8 are used for XOR patterns.
|
||||
// 7 inverseFold (fold to upper, not to lower)
|
||||
// 6 index: interpret the XOR pattern as an index
|
||||
// or isMid if case mode is cIgnorableUncased.
|
||||
// 5..4 CCC: zero (normal or break), above or other
|
||||
// }
|
||||
// 3 exception: interpret this value as an exception index
|
||||
// (TODO: is this bit necessary? Probably implied from case mode.)
|
||||
// 2..0 case mode
|
||||
//
|
||||
// For the non-exceptional cases, a rune must be either uncased, lowercase or
|
||||
// uppercase. If the rune is cased, the XOR pattern maps either a lowercase
|
||||
// rune to uppercase or an uppercase rune to lowercase (applied to the 10
|
||||
// least-significant bits of the rune).
|
||||
//
|
||||
// See the definitions below for a more detailed description of the various
|
||||
// bits.
|
||||
type info uint16
|
||||
|
||||
const (
|
||||
casedMask = 0x0003
|
||||
fullCasedMask = 0x0007
|
||||
ignorableMask = 0x0006
|
||||
ignorableValue = 0x0004
|
||||
|
||||
inverseFoldBit = 1 << 7
|
||||
isMidBit = 1 << 6
|
||||
|
||||
exceptionBit = 1 << 3
|
||||
exceptionShift = 4
|
||||
numExceptionBits = 12
|
||||
|
||||
xorIndexBit = 1 << 6
|
||||
xorShift = 8
|
||||
|
||||
// There is no mapping if all xor bits and the exception bit are zero.
|
||||
hasMappingMask = 0xff80 | exceptionBit
|
||||
)
|
||||
|
||||
// The case mode bits encodes the case type of a rune. This includes uncased,
|
||||
// title, upper and lower case and case ignorable. (For a definition of these
|
||||
// terms see Chapter 3 of The Unicode Standard Core Specification.) In some rare
|
||||
// cases, a rune can be both cased and case-ignorable. This is encoded by
|
||||
// cIgnorableCased. A rune of this type is always lower case. Some runes are
|
||||
// cased while not having a mapping.
|
||||
//
|
||||
// A common pattern for scripts in the Unicode standard is for upper and lower
|
||||
// case runes to alternate for increasing rune values (e.g. the accented Latin
|
||||
// ranges starting from U+0100 and U+1E00 among others and some Cyrillic
|
||||
// characters). We use this property by defining a cXORCase mode, where the case
|
||||
// mode (always upper or lower case) is derived from the rune value. As the XOR
|
||||
// pattern for case mappings is often identical for successive runes, using
|
||||
// cXORCase can result in large series of identical trie values. This, in turn,
|
||||
// allows us to better compress the trie blocks.
|
||||
const (
|
||||
cUncased info = iota // 000
|
||||
cTitle // 001
|
||||
cLower // 010
|
||||
cUpper // 011
|
||||
cIgnorableUncased // 100
|
||||
cIgnorableCased // 101 // lower case if mappings exist
|
||||
cXORCase // 11x // case is cLower | ((rune&1) ^ x)
|
||||
|
||||
maxCaseMode = cUpper
|
||||
)
|
||||
|
||||
func (c info) isCased() bool {
|
||||
return c&casedMask != 0
|
||||
}
|
||||
|
||||
func (c info) isCaseIgnorable() bool {
|
||||
return c&ignorableMask == ignorableValue
|
||||
}
|
||||
|
||||
func (c info) isNotCasedAndNotCaseIgnorable() bool {
|
||||
return c&fullCasedMask == 0
|
||||
}
|
||||
|
||||
func (c info) isCaseIgnorableAndNotCased() bool {
|
||||
return c&fullCasedMask == cIgnorableUncased
|
||||
}
|
||||
|
||||
func (c info) isMid() bool {
|
||||
return c&(fullCasedMask|isMidBit) == isMidBit|cIgnorableUncased
|
||||
}
|
||||
|
||||
// The case mapping implementation will need to know about various Canonical
|
||||
// Combining Class (CCC) values. We encode two of these in the trie value:
|
||||
// cccZero (0) and cccAbove (230). If the value is cccOther, it means that
|
||||
// CCC(r) > 0, but not 230. A value of cccBreak means that CCC(r) == 0 and that
|
||||
// the rune also has the break category Break (see below).
|
||||
const (
|
||||
cccBreak info = iota << 4
|
||||
cccZero
|
||||
cccAbove
|
||||
cccOther
|
||||
|
||||
cccMask = cccBreak | cccZero | cccAbove | cccOther
|
||||
)
|
||||
|
||||
const (
|
||||
starter = 0
|
||||
above = 230
|
||||
iotaSubscript = 240
|
||||
)
|
||||
|
||||
// The exceptions slice holds data that does not fit in a normal info entry.
|
||||
// The entry is pointed to by the exception index in an entry. It has the
|
||||
// following format:
|
||||
//
|
||||
// Header:
|
||||
//
|
||||
// byte 0:
|
||||
// 7..6 unused
|
||||
// 5..4 CCC type (same bits as entry)
|
||||
// 3 unused
|
||||
// 2..0 length of fold
|
||||
//
|
||||
// byte 1:
|
||||
// 7..6 unused
|
||||
// 5..3 length of 1st mapping of case type
|
||||
// 2..0 length of 2nd mapping of case type
|
||||
//
|
||||
// case 1st 2nd
|
||||
// lower -> upper, title
|
||||
// upper -> lower, title
|
||||
// title -> lower, upper
|
||||
//
|
||||
// Lengths with the value 0x7 indicate no value and implies no change.
|
||||
// A length of 0 indicates a mapping to zero-length string.
|
||||
//
|
||||
// Body bytes:
|
||||
//
|
||||
// case folding bytes
|
||||
// lowercase mapping bytes
|
||||
// uppercase mapping bytes
|
||||
// titlecase mapping bytes
|
||||
// closure mapping bytes (for NFKC_Casefold). (TODO)
|
||||
//
|
||||
// Fallbacks:
|
||||
//
|
||||
// missing fold -> lower
|
||||
// missing title -> upper
|
||||
// all missing -> original rune
|
||||
//
|
||||
// exceptions starts with a dummy byte to enforce that there is no zero index
|
||||
// value.
|
||||
const (
|
||||
lengthMask = 0x07
|
||||
lengthBits = 3
|
||||
noChange = 0
|
||||
)
|
||||
|
||||
// References to generated trie.
|
||||
|
||||
var trie = newCaseTrie(0)
|
||||
|
||||
var sparse = sparseBlocks{
|
||||
values: sparseValues[:],
|
||||
offsets: sparseOffsets[:],
|
||||
}
|
||||
|
||||
// Sparse block lookup code.
|
||||
|
||||
// valueRange is an entry in a sparse block.
|
||||
type valueRange struct {
|
||||
value uint16
|
||||
lo, hi byte
|
||||
}
|
||||
|
||||
type sparseBlocks struct {
|
||||
values []valueRange
|
||||
offsets []uint16
|
||||
}
|
||||
|
||||
// lookup returns the value from values block n for byte b using binary search.
|
||||
func (s *sparseBlocks) lookup(n uint32, b byte) uint16 {
|
||||
lo := s.offsets[n]
|
||||
hi := s.offsets[n+1]
|
||||
for lo < hi {
|
||||
m := lo + (hi-lo)/2
|
||||
r := s.values[m]
|
||||
if r.lo <= b && b <= r.hi {
|
||||
return r.value
|
||||
}
|
||||
if b < r.lo {
|
||||
hi = m
|
||||
} else {
|
||||
lo = m + 1
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// lastRuneForTesting is the last rune used for testing. Everything after this
|
||||
// is boring.
|
||||
const lastRuneForTesting = rune(0x1FFFF)
|
|
@ -0,0 +1,49 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package internal contains non-exported functionality that are used by
|
||||
// packages in the text repository.
|
||||
package internal // import "golang.org/x/text/internal"
|
||||
|
||||
import (
|
||||
"sort"
|
||||
|
||||
"golang.org/x/text/language"
|
||||
)
|
||||
|
||||
// SortTags sorts tags in place.
|
||||
func SortTags(tags []language.Tag) {
|
||||
sort.Sort(sorter(tags))
|
||||
}
|
||||
|
||||
type sorter []language.Tag
|
||||
|
||||
func (s sorter) Len() int {
|
||||
return len(s)
|
||||
}
|
||||
|
||||
func (s sorter) Swap(i, j int) {
|
||||
s[i], s[j] = s[j], s[i]
|
||||
}
|
||||
|
||||
func (s sorter) Less(i, j int) bool {
|
||||
return s[i].String() < s[j].String()
|
||||
}
|
||||
|
||||
// UniqueTags sorts and filters duplicate tags in place and returns a slice with
|
||||
// only unique tags.
|
||||
func UniqueTags(tags []language.Tag) []language.Tag {
|
||||
if len(tags) <= 1 {
|
||||
return tags
|
||||
}
|
||||
SortTags(tags)
|
||||
k := 0
|
||||
for i := 1; i < len(tags); i++ {
|
||||
if tags[k].String() < tags[i].String() {
|
||||
k++
|
||||
tags[k] = tags[i]
|
||||
}
|
||||
}
|
||||
return tags[:k+1]
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||
|
||||
package language
|
||||
|
||||
// This file contains code common to the maketables.go and the package code.
|
||||
|
||||
// AliasType is the type of an alias in AliasMap.
|
||||
type AliasType int8
|
||||
|
||||
const (
|
||||
Deprecated AliasType = iota
|
||||
Macro
|
||||
Legacy
|
||||
|
||||
AliasTypeUnknown AliasType = -1
|
||||
)
|
|
@ -0,0 +1,29 @@
|
|||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
// CompactCoreInfo is a compact integer with the three core tags encoded.
|
||||
type CompactCoreInfo uint32
|
||||
|
||||
// GetCompactCore generates a uint32 value that is guaranteed to be unique for
|
||||
// different language, region, and script values.
|
||||
func GetCompactCore(t Tag) (cci CompactCoreInfo, ok bool) {
|
||||
if t.LangID > langNoIndexOffset {
|
||||
return 0, false
|
||||
}
|
||||
cci |= CompactCoreInfo(t.LangID) << (8 + 12)
|
||||
cci |= CompactCoreInfo(t.ScriptID) << 12
|
||||
cci |= CompactCoreInfo(t.RegionID)
|
||||
return cci, true
|
||||
}
|
||||
|
||||
// Tag generates a tag from c.
|
||||
func (c CompactCoreInfo) Tag() Tag {
|
||||
return Tag{
|
||||
LangID: Language(c >> 20),
|
||||
RegionID: Region(c & 0x3ff),
|
||||
ScriptID: Script(c>>12) & 0xff,
|
||||
}
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package compact defines a compact representation of language tags.
|
||||
//
|
||||
// Common language tags (at least all for which locale information is defined
|
||||
// in CLDR) are assigned a unique index. Each Tag is associated with such an
|
||||
// ID for selecting language-related resources (such as translations) as well
|
||||
// as one for selecting regional defaults (currency, number formatting, etc.)
|
||||
//
|
||||
// It may want to export this functionality at some point, but at this point
|
||||
// this is only available for use within x/text.
|
||||
package compact // import "golang.org/x/text/internal/language/compact"
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/language"
|
||||
)
|
||||
|
||||
// ID is an integer identifying a single tag.
|
||||
type ID uint16
|
||||
|
||||
func getCoreIndex(t language.Tag) (id ID, ok bool) {
|
||||
cci, ok := language.GetCompactCore(t)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
i := sort.Search(len(coreTags), func(i int) bool {
|
||||
return cci <= coreTags[i]
|
||||
})
|
||||
if i == len(coreTags) || coreTags[i] != cci {
|
||||
return 0, false
|
||||
}
|
||||
return ID(i), true
|
||||
}
|
||||
|
||||
// Parent returns the ID of the parent or the root ID if id is already the root.
|
||||
func (id ID) Parent() ID {
|
||||
return parents[id]
|
||||
}
|
||||
|
||||
// Tag converts id to an internal language Tag.
|
||||
func (id ID) Tag() language.Tag {
|
||||
if int(id) >= len(coreTags) {
|
||||
return specialTags[int(id)-len(coreTags)]
|
||||
}
|
||||
return coreTags[id].Tag()
|
||||
}
|
||||
|
||||
var specialTags []language.Tag
|
||||
|
||||
func init() {
|
||||
tags := strings.Split(specialTagsStr, " ")
|
||||
specialTags = make([]language.Tag, len(tags))
|
||||
for i, t := range tags {
|
||||
specialTags[i] = language.MustParse(t)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,260 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go gen_index.go -output tables.go
|
||||
//go:generate go run gen_parents.go
|
||||
|
||||
package compact
|
||||
|
||||
// TODO: Remove above NOTE after:
|
||||
// - verifying that tables are dropped correctly (most notably matcher tables).
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/language"
|
||||
)
|
||||
|
||||
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
|
||||
// specific language or locale. All language tag values are guaranteed to be
|
||||
// well-formed.
|
||||
type Tag struct {
|
||||
// NOTE: exported tags will become part of the public API.
|
||||
language ID
|
||||
locale ID
|
||||
full fullTag // always a language.Tag for now.
|
||||
}
|
||||
|
||||
const _und = 0
|
||||
|
||||
type fullTag interface {
|
||||
IsRoot() bool
|
||||
Parent() language.Tag
|
||||
}
|
||||
|
||||
// Make a compact Tag from a fully specified internal language Tag.
|
||||
func Make(t language.Tag) (tag Tag) {
|
||||
if region := t.TypeForKey("rg"); len(region) == 6 && region[2:] == "zzzz" {
|
||||
if r, err := language.ParseRegion(region[:2]); err == nil {
|
||||
tFull := t
|
||||
t, _ = t.SetTypeForKey("rg", "")
|
||||
// TODO: should we not consider "va" for the language tag?
|
||||
var exact1, exact2 bool
|
||||
tag.language, exact1 = FromTag(t)
|
||||
t.RegionID = r
|
||||
tag.locale, exact2 = FromTag(t)
|
||||
if !exact1 || !exact2 {
|
||||
tag.full = tFull
|
||||
}
|
||||
return tag
|
||||
}
|
||||
}
|
||||
lang, ok := FromTag(t)
|
||||
tag.language = lang
|
||||
tag.locale = lang
|
||||
if !ok {
|
||||
tag.full = t
|
||||
}
|
||||
return tag
|
||||
}
|
||||
|
||||
// Tag returns an internal language Tag version of this tag.
|
||||
func (t Tag) Tag() language.Tag {
|
||||
if t.full != nil {
|
||||
return t.full.(language.Tag)
|
||||
}
|
||||
tag := t.language.Tag()
|
||||
if t.language != t.locale {
|
||||
loc := t.locale.Tag()
|
||||
tag, _ = tag.SetTypeForKey("rg", strings.ToLower(loc.RegionID.String())+"zzzz")
|
||||
}
|
||||
return tag
|
||||
}
|
||||
|
||||
// IsCompact reports whether this tag is fully defined in terms of ID.
|
||||
func (t *Tag) IsCompact() bool {
|
||||
return t.full == nil
|
||||
}
|
||||
|
||||
// MayHaveVariants reports whether a tag may have variants. If it returns false
|
||||
// it is guaranteed the tag does not have variants.
|
||||
func (t Tag) MayHaveVariants() bool {
|
||||
return t.full != nil || int(t.language) >= len(coreTags)
|
||||
}
|
||||
|
||||
// MayHaveExtensions reports whether a tag may have extensions. If it returns
|
||||
// false it is guaranteed the tag does not have them.
|
||||
func (t Tag) MayHaveExtensions() bool {
|
||||
return t.full != nil ||
|
||||
int(t.language) >= len(coreTags) ||
|
||||
t.language != t.locale
|
||||
}
|
||||
|
||||
// IsRoot returns true if t is equal to language "und".
|
||||
func (t Tag) IsRoot() bool {
|
||||
if t.full != nil {
|
||||
return t.full.IsRoot()
|
||||
}
|
||||
return t.language == _und
|
||||
}
|
||||
|
||||
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
|
||||
// specific language are substituted with fields from the parent language.
|
||||
// The parent for a language may change for newer versions of CLDR.
|
||||
func (t Tag) Parent() Tag {
|
||||
if t.full != nil {
|
||||
return Make(t.full.Parent())
|
||||
}
|
||||
if t.language != t.locale {
|
||||
// Simulate stripping -u-rg-xxxxxx
|
||||
return Tag{language: t.language, locale: t.language}
|
||||
}
|
||||
// TODO: use parent lookup table once cycle from internal package is
|
||||
// removed. Probably by internalizing the table and declaring this fast
|
||||
// enough.
|
||||
// lang := compactID(internal.Parent(uint16(t.language)))
|
||||
lang, _ := FromTag(t.language.Tag().Parent())
|
||||
return Tag{language: lang, locale: lang}
|
||||
}
|
||||
|
||||
// nextToken returns token t and the rest of the string.
|
||||
func nextToken(s string) (t, tail string) {
|
||||
p := strings.Index(s[1:], "-")
|
||||
if p == -1 {
|
||||
return s[1:], ""
|
||||
}
|
||||
p++
|
||||
return s[1:p], s[p:]
|
||||
}
|
||||
|
||||
// LanguageID returns an index, where 0 <= index < NumCompactTags, for tags
|
||||
// for which data exists in the text repository.The index will change over time
|
||||
// and should not be stored in persistent storage. If t does not match a compact
|
||||
// index, exact will be false and the compact index will be returned for the
|
||||
// first match after repeatedly taking the Parent of t.
|
||||
func LanguageID(t Tag) (id ID, exact bool) {
|
||||
return t.language, t.full == nil
|
||||
}
|
||||
|
||||
// RegionalID returns the ID for the regional variant of this tag. This index is
|
||||
// used to indicate region-specific overrides, such as default currency, default
|
||||
// calendar and week data, default time cycle, and default measurement system
|
||||
// and unit preferences.
|
||||
//
|
||||
// For instance, the tag en-GB-u-rg-uszzzz specifies British English with US
|
||||
// settings for currency, number formatting, etc. The CompactIndex for this tag
|
||||
// will be that for en-GB, while the RegionalID will be the one corresponding to
|
||||
// en-US.
|
||||
func RegionalID(t Tag) (id ID, exact bool) {
|
||||
return t.locale, t.full == nil
|
||||
}
|
||||
|
||||
// LanguageTag returns t stripped of regional variant indicators.
|
||||
//
|
||||
// At the moment this means it is stripped of a regional and variant subtag "rg"
|
||||
// and "va" in the "u" extension.
|
||||
func (t Tag) LanguageTag() Tag {
|
||||
if t.full == nil {
|
||||
return Tag{language: t.language, locale: t.language}
|
||||
}
|
||||
tt := t.Tag()
|
||||
tt.SetTypeForKey("rg", "")
|
||||
tt.SetTypeForKey("va", "")
|
||||
return Make(tt)
|
||||
}
|
||||
|
||||
// RegionalTag returns the regional variant of the tag.
|
||||
//
|
||||
// At the moment this means that the region is set from the regional subtag
|
||||
// "rg" in the "u" extension.
|
||||
func (t Tag) RegionalTag() Tag {
|
||||
rt := Tag{language: t.locale, locale: t.locale}
|
||||
if t.full == nil {
|
||||
return rt
|
||||
}
|
||||
b := language.Builder{}
|
||||
tag := t.Tag()
|
||||
// tag, _ = tag.SetTypeForKey("rg", "")
|
||||
b.SetTag(t.locale.Tag())
|
||||
if v := tag.Variants(); v != "" {
|
||||
for _, v := range strings.Split(v, "-") {
|
||||
b.AddVariant(v)
|
||||
}
|
||||
}
|
||||
for _, e := range tag.Extensions() {
|
||||
b.AddExt(e)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// FromTag reports closest matching ID for an internal language Tag.
|
||||
func FromTag(t language.Tag) (id ID, exact bool) {
|
||||
// TODO: perhaps give more frequent tags a lower index.
|
||||
// TODO: we could make the indexes stable. This will excluded some
|
||||
// possibilities for optimization, so don't do this quite yet.
|
||||
exact = true
|
||||
|
||||
b, s, r := t.Raw()
|
||||
if t.HasString() {
|
||||
if t.IsPrivateUse() {
|
||||
// We have no entries for user-defined tags.
|
||||
return 0, false
|
||||
}
|
||||
hasExtra := false
|
||||
if t.HasVariants() {
|
||||
if t.HasExtensions() {
|
||||
build := language.Builder{}
|
||||
build.SetTag(language.Tag{LangID: b, ScriptID: s, RegionID: r})
|
||||
build.AddVariant(t.Variants())
|
||||
exact = false
|
||||
t = build.Make()
|
||||
}
|
||||
hasExtra = true
|
||||
} else if _, ok := t.Extension('u'); ok {
|
||||
// TODO: va may mean something else. Consider not considering it.
|
||||
// Strip all but the 'va' entry.
|
||||
old := t
|
||||
variant := t.TypeForKey("va")
|
||||
t = language.Tag{LangID: b, ScriptID: s, RegionID: r}
|
||||
if variant != "" {
|
||||
t, _ = t.SetTypeForKey("va", variant)
|
||||
hasExtra = true
|
||||
}
|
||||
exact = old == t
|
||||
} else {
|
||||
exact = false
|
||||
}
|
||||
if hasExtra {
|
||||
// We have some variants.
|
||||
for i, s := range specialTags {
|
||||
if s == t {
|
||||
return ID(i + len(coreTags)), exact
|
||||
}
|
||||
}
|
||||
exact = false
|
||||
}
|
||||
}
|
||||
if x, ok := getCoreIndex(t); ok {
|
||||
return x, exact
|
||||
}
|
||||
exact = false
|
||||
if r != 0 && s == 0 {
|
||||
// Deal with cases where an extra script is inserted for the region.
|
||||
t, _ := t.Maximize()
|
||||
if x, ok := getCoreIndex(t); ok {
|
||||
return x, exact
|
||||
}
|
||||
}
|
||||
for t = t.Parent(); t != root; t = t.Parent() {
|
||||
// No variants specified: just compare core components.
|
||||
// The key has the form lllssrrr, where l, s, and r are nibbles for
|
||||
// respectively the langID, scriptID, and regionID.
|
||||
if x, ok := getCoreIndex(t); ok {
|
||||
return x, exact
|
||||
}
|
||||
}
|
||||
return 0, exact
|
||||
}
|
||||
|
||||
var root = language.Tag{}
|
|
@ -0,0 +1,120 @@
|
|||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||
|
||||
package compact
|
||||
|
||||
// parents maps a compact index of a tag to the compact index of the parent of
|
||||
// this tag.
|
||||
var parents = []ID{ // 775 elements
|
||||
// Entry 0 - 3F
|
||||
0x0000, 0x0000, 0x0001, 0x0001, 0x0000, 0x0004, 0x0000, 0x0006,
|
||||
0x0000, 0x0008, 0x0000, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a,
|
||||
0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a,
|
||||
0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a,
|
||||
0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x0000,
|
||||
0x0000, 0x0028, 0x0000, 0x002a, 0x0000, 0x002c, 0x0000, 0x0000,
|
||||
0x002f, 0x002e, 0x002e, 0x0000, 0x0033, 0x0000, 0x0035, 0x0000,
|
||||
0x0037, 0x0000, 0x0039, 0x0000, 0x003b, 0x0000, 0x0000, 0x003e,
|
||||
// Entry 40 - 7F
|
||||
0x0000, 0x0040, 0x0040, 0x0000, 0x0043, 0x0043, 0x0000, 0x0046,
|
||||
0x0000, 0x0048, 0x0000, 0x0000, 0x004b, 0x004a, 0x004a, 0x0000,
|
||||
0x004f, 0x004f, 0x004f, 0x004f, 0x0000, 0x0054, 0x0054, 0x0000,
|
||||
0x0057, 0x0000, 0x0059, 0x0000, 0x005b, 0x0000, 0x005d, 0x005d,
|
||||
0x0000, 0x0060, 0x0000, 0x0062, 0x0000, 0x0064, 0x0000, 0x0066,
|
||||
0x0066, 0x0000, 0x0069, 0x0000, 0x006b, 0x006b, 0x006b, 0x006b,
|
||||
0x006b, 0x006b, 0x006b, 0x0000, 0x0073, 0x0000, 0x0075, 0x0000,
|
||||
0x0077, 0x0000, 0x0000, 0x007a, 0x0000, 0x007c, 0x0000, 0x007e,
|
||||
// Entry 80 - BF
|
||||
0x0000, 0x0080, 0x0080, 0x0000, 0x0083, 0x0083, 0x0000, 0x0086,
|
||||
0x0087, 0x0087, 0x0087, 0x0086, 0x0088, 0x0087, 0x0087, 0x0087,
|
||||
0x0086, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0088,
|
||||
0x0087, 0x0087, 0x0087, 0x0087, 0x0088, 0x0087, 0x0088, 0x0087,
|
||||
0x0087, 0x0088, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
|
||||
0x0087, 0x0087, 0x0087, 0x0086, 0x0087, 0x0087, 0x0087, 0x0087,
|
||||
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
|
||||
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0086, 0x0087, 0x0086,
|
||||
// Entry C0 - FF
|
||||
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
|
||||
0x0088, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
|
||||
0x0086, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0088, 0x0087,
|
||||
0x0087, 0x0088, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
|
||||
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0086, 0x0086, 0x0087,
|
||||
0x0087, 0x0086, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0000,
|
||||
0x00ef, 0x0000, 0x00f1, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f2,
|
||||
0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f1, 0x00f2, 0x00f1, 0x00f1,
|
||||
// Entry 100 - 13F
|
||||
0x00f2, 0x00f2, 0x00f1, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f1,
|
||||
0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x0000, 0x010e,
|
||||
0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0114, 0x0000,
|
||||
0x0117, 0x0117, 0x0117, 0x0117, 0x0000, 0x011c, 0x0000, 0x011e,
|
||||
0x0000, 0x0120, 0x0120, 0x0000, 0x0123, 0x0123, 0x0123, 0x0123,
|
||||
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
|
||||
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
|
||||
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
|
||||
// Entry 140 - 17F
|
||||
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
|
||||
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
|
||||
0x0123, 0x0123, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156,
|
||||
0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x015c, 0x015c,
|
||||
0x0000, 0x0160, 0x0000, 0x0000, 0x0163, 0x0000, 0x0165, 0x0000,
|
||||
0x0167, 0x0167, 0x0167, 0x0000, 0x016b, 0x0000, 0x016d, 0x0000,
|
||||
0x016f, 0x0000, 0x0171, 0x0171, 0x0000, 0x0174, 0x0000, 0x0176,
|
||||
0x0000, 0x0178, 0x0000, 0x017a, 0x0000, 0x017c, 0x0000, 0x017e,
|
||||
// Entry 180 - 1BF
|
||||
0x0000, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0184, 0x0184,
|
||||
0x0184, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x018e,
|
||||
0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x0000, 0x0195, 0x0000,
|
||||
0x0197, 0x0000, 0x0000, 0x019a, 0x0000, 0x0000, 0x019d, 0x0000,
|
||||
0x019f, 0x0000, 0x01a1, 0x0000, 0x01a3, 0x0000, 0x01a5, 0x0000,
|
||||
0x01a7, 0x0000, 0x01a9, 0x0000, 0x01ab, 0x0000, 0x01ad, 0x0000,
|
||||
0x01af, 0x0000, 0x01b1, 0x01b1, 0x0000, 0x01b4, 0x0000, 0x01b6,
|
||||
0x0000, 0x01b8, 0x0000, 0x01ba, 0x0000, 0x01bc, 0x0000, 0x0000,
|
||||
// Entry 1C0 - 1FF
|
||||
0x01bf, 0x0000, 0x01c1, 0x0000, 0x01c3, 0x0000, 0x01c5, 0x0000,
|
||||
0x01c7, 0x0000, 0x01c9, 0x0000, 0x01cb, 0x01cb, 0x01cb, 0x01cb,
|
||||
0x0000, 0x01d0, 0x0000, 0x01d2, 0x01d2, 0x0000, 0x01d5, 0x0000,
|
||||
0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x0000, 0x01dd, 0x0000,
|
||||
0x01df, 0x01df, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6,
|
||||
0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee,
|
||||
0x0000, 0x01f0, 0x0000, 0x0000, 0x01f3, 0x0000, 0x01f5, 0x01f5,
|
||||
0x01f5, 0x0000, 0x01f9, 0x0000, 0x01fb, 0x0000, 0x01fd, 0x0000,
|
||||
// Entry 200 - 23F
|
||||
0x01ff, 0x0000, 0x0000, 0x0202, 0x0000, 0x0204, 0x0204, 0x0000,
|
||||
0x0207, 0x0000, 0x0209, 0x0209, 0x0000, 0x020c, 0x020c, 0x0000,
|
||||
0x020f, 0x020f, 0x020f, 0x020f, 0x020f, 0x020f, 0x020f, 0x0000,
|
||||
0x0217, 0x0000, 0x0219, 0x0000, 0x021b, 0x0000, 0x0000, 0x0000,
|
||||
0x0000, 0x0000, 0x0221, 0x0000, 0x0000, 0x0224, 0x0000, 0x0226,
|
||||
0x0226, 0x0000, 0x0229, 0x0000, 0x022b, 0x022b, 0x0000, 0x0000,
|
||||
0x022f, 0x022e, 0x022e, 0x0000, 0x0000, 0x0234, 0x0000, 0x0236,
|
||||
0x0000, 0x0238, 0x0000, 0x0244, 0x023a, 0x0244, 0x0244, 0x0244,
|
||||
// Entry 240 - 27F
|
||||
0x0244, 0x0244, 0x0244, 0x0244, 0x023a, 0x0244, 0x0244, 0x0000,
|
||||
0x0247, 0x0247, 0x0247, 0x0000, 0x024b, 0x0000, 0x024d, 0x0000,
|
||||
0x024f, 0x024f, 0x0000, 0x0252, 0x0000, 0x0254, 0x0254, 0x0254,
|
||||
0x0254, 0x0254, 0x0254, 0x0000, 0x025b, 0x0000, 0x025d, 0x0000,
|
||||
0x025f, 0x0000, 0x0261, 0x0000, 0x0263, 0x0000, 0x0265, 0x0000,
|
||||
0x0000, 0x0268, 0x0268, 0x0268, 0x0000, 0x026c, 0x0000, 0x026e,
|
||||
0x0000, 0x0270, 0x0000, 0x0000, 0x0000, 0x0274, 0x0273, 0x0273,
|
||||
0x0000, 0x0278, 0x0000, 0x027a, 0x0000, 0x027c, 0x0000, 0x0000,
|
||||
// Entry 280 - 2BF
|
||||
0x0000, 0x0000, 0x0281, 0x0000, 0x0000, 0x0284, 0x0000, 0x0286,
|
||||
0x0286, 0x0286, 0x0286, 0x0000, 0x028b, 0x028b, 0x028b, 0x0000,
|
||||
0x028f, 0x028f, 0x028f, 0x028f, 0x028f, 0x0000, 0x0295, 0x0295,
|
||||
0x0295, 0x0295, 0x0000, 0x0000, 0x0000, 0x0000, 0x029d, 0x029d,
|
||||
0x029d, 0x0000, 0x02a1, 0x02a1, 0x02a1, 0x02a1, 0x0000, 0x0000,
|
||||
0x02a7, 0x02a7, 0x02a7, 0x02a7, 0x0000, 0x02ac, 0x0000, 0x02ae,
|
||||
0x02ae, 0x0000, 0x02b1, 0x0000, 0x02b3, 0x0000, 0x02b5, 0x02b5,
|
||||
0x0000, 0x0000, 0x02b9, 0x0000, 0x0000, 0x0000, 0x02bd, 0x0000,
|
||||
// Entry 2C0 - 2FF
|
||||
0x02bf, 0x02bf, 0x0000, 0x0000, 0x02c3, 0x0000, 0x02c5, 0x0000,
|
||||
0x02c7, 0x0000, 0x02c9, 0x0000, 0x02cb, 0x0000, 0x02cd, 0x02cd,
|
||||
0x0000, 0x0000, 0x02d1, 0x0000, 0x02d3, 0x02d0, 0x02d0, 0x0000,
|
||||
0x0000, 0x02d8, 0x02d7, 0x02d7, 0x0000, 0x0000, 0x02dd, 0x0000,
|
||||
0x02df, 0x0000, 0x02e1, 0x0000, 0x0000, 0x02e4, 0x0000, 0x02e6,
|
||||
0x0000, 0x0000, 0x02e9, 0x0000, 0x02eb, 0x0000, 0x02ed, 0x0000,
|
||||
0x02ef, 0x02ef, 0x0000, 0x0000, 0x02f3, 0x02f2, 0x02f2, 0x0000,
|
||||
0x02f7, 0x0000, 0x02f9, 0x02f9, 0x02f9, 0x02f9, 0x02f9, 0x0000,
|
||||
// Entry 300 - 33F
|
||||
0x02ff, 0x0300, 0x02ff, 0x0000, 0x0303, 0x0051, 0x00e6,
|
||||
} // Size: 1574 bytes
|
||||
|
||||
// Total table size 1574 bytes (1KiB); checksum: 895AAF0B
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,91 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package compact
|
||||
|
||||
var (
|
||||
und = Tag{}
|
||||
|
||||
Und Tag = Tag{}
|
||||
|
||||
Afrikaans Tag = Tag{language: afIndex, locale: afIndex}
|
||||
Amharic Tag = Tag{language: amIndex, locale: amIndex}
|
||||
Arabic Tag = Tag{language: arIndex, locale: arIndex}
|
||||
ModernStandardArabic Tag = Tag{language: ar001Index, locale: ar001Index}
|
||||
Azerbaijani Tag = Tag{language: azIndex, locale: azIndex}
|
||||
Bulgarian Tag = Tag{language: bgIndex, locale: bgIndex}
|
||||
Bengali Tag = Tag{language: bnIndex, locale: bnIndex}
|
||||
Catalan Tag = Tag{language: caIndex, locale: caIndex}
|
||||
Czech Tag = Tag{language: csIndex, locale: csIndex}
|
||||
Danish Tag = Tag{language: daIndex, locale: daIndex}
|
||||
German Tag = Tag{language: deIndex, locale: deIndex}
|
||||
Greek Tag = Tag{language: elIndex, locale: elIndex}
|
||||
English Tag = Tag{language: enIndex, locale: enIndex}
|
||||
AmericanEnglish Tag = Tag{language: enUSIndex, locale: enUSIndex}
|
||||
BritishEnglish Tag = Tag{language: enGBIndex, locale: enGBIndex}
|
||||
Spanish Tag = Tag{language: esIndex, locale: esIndex}
|
||||
EuropeanSpanish Tag = Tag{language: esESIndex, locale: esESIndex}
|
||||
LatinAmericanSpanish Tag = Tag{language: es419Index, locale: es419Index}
|
||||
Estonian Tag = Tag{language: etIndex, locale: etIndex}
|
||||
Persian Tag = Tag{language: faIndex, locale: faIndex}
|
||||
Finnish Tag = Tag{language: fiIndex, locale: fiIndex}
|
||||
Filipino Tag = Tag{language: filIndex, locale: filIndex}
|
||||
French Tag = Tag{language: frIndex, locale: frIndex}
|
||||
CanadianFrench Tag = Tag{language: frCAIndex, locale: frCAIndex}
|
||||
Gujarati Tag = Tag{language: guIndex, locale: guIndex}
|
||||
Hebrew Tag = Tag{language: heIndex, locale: heIndex}
|
||||
Hindi Tag = Tag{language: hiIndex, locale: hiIndex}
|
||||
Croatian Tag = Tag{language: hrIndex, locale: hrIndex}
|
||||
Hungarian Tag = Tag{language: huIndex, locale: huIndex}
|
||||
Armenian Tag = Tag{language: hyIndex, locale: hyIndex}
|
||||
Indonesian Tag = Tag{language: idIndex, locale: idIndex}
|
||||
Icelandic Tag = Tag{language: isIndex, locale: isIndex}
|
||||
Italian Tag = Tag{language: itIndex, locale: itIndex}
|
||||
Japanese Tag = Tag{language: jaIndex, locale: jaIndex}
|
||||
Georgian Tag = Tag{language: kaIndex, locale: kaIndex}
|
||||
Kazakh Tag = Tag{language: kkIndex, locale: kkIndex}
|
||||
Khmer Tag = Tag{language: kmIndex, locale: kmIndex}
|
||||
Kannada Tag = Tag{language: knIndex, locale: knIndex}
|
||||
Korean Tag = Tag{language: koIndex, locale: koIndex}
|
||||
Kirghiz Tag = Tag{language: kyIndex, locale: kyIndex}
|
||||
Lao Tag = Tag{language: loIndex, locale: loIndex}
|
||||
Lithuanian Tag = Tag{language: ltIndex, locale: ltIndex}
|
||||
Latvian Tag = Tag{language: lvIndex, locale: lvIndex}
|
||||
Macedonian Tag = Tag{language: mkIndex, locale: mkIndex}
|
||||
Malayalam Tag = Tag{language: mlIndex, locale: mlIndex}
|
||||
Mongolian Tag = Tag{language: mnIndex, locale: mnIndex}
|
||||
Marathi Tag = Tag{language: mrIndex, locale: mrIndex}
|
||||
Malay Tag = Tag{language: msIndex, locale: msIndex}
|
||||
Burmese Tag = Tag{language: myIndex, locale: myIndex}
|
||||
Nepali Tag = Tag{language: neIndex, locale: neIndex}
|
||||
Dutch Tag = Tag{language: nlIndex, locale: nlIndex}
|
||||
Norwegian Tag = Tag{language: noIndex, locale: noIndex}
|
||||
Punjabi Tag = Tag{language: paIndex, locale: paIndex}
|
||||
Polish Tag = Tag{language: plIndex, locale: plIndex}
|
||||
Portuguese Tag = Tag{language: ptIndex, locale: ptIndex}
|
||||
BrazilianPortuguese Tag = Tag{language: ptBRIndex, locale: ptBRIndex}
|
||||
EuropeanPortuguese Tag = Tag{language: ptPTIndex, locale: ptPTIndex}
|
||||
Romanian Tag = Tag{language: roIndex, locale: roIndex}
|
||||
Russian Tag = Tag{language: ruIndex, locale: ruIndex}
|
||||
Sinhala Tag = Tag{language: siIndex, locale: siIndex}
|
||||
Slovak Tag = Tag{language: skIndex, locale: skIndex}
|
||||
Slovenian Tag = Tag{language: slIndex, locale: slIndex}
|
||||
Albanian Tag = Tag{language: sqIndex, locale: sqIndex}
|
||||
Serbian Tag = Tag{language: srIndex, locale: srIndex}
|
||||
SerbianLatin Tag = Tag{language: srLatnIndex, locale: srLatnIndex}
|
||||
Swedish Tag = Tag{language: svIndex, locale: svIndex}
|
||||
Swahili Tag = Tag{language: swIndex, locale: swIndex}
|
||||
Tamil Tag = Tag{language: taIndex, locale: taIndex}
|
||||
Telugu Tag = Tag{language: teIndex, locale: teIndex}
|
||||
Thai Tag = Tag{language: thIndex, locale: thIndex}
|
||||
Turkish Tag = Tag{language: trIndex, locale: trIndex}
|
||||
Ukrainian Tag = Tag{language: ukIndex, locale: ukIndex}
|
||||
Urdu Tag = Tag{language: urIndex, locale: urIndex}
|
||||
Uzbek Tag = Tag{language: uzIndex, locale: uzIndex}
|
||||
Vietnamese Tag = Tag{language: viIndex, locale: viIndex}
|
||||
Chinese Tag = Tag{language: zhIndex, locale: zhIndex}
|
||||
SimplifiedChinese Tag = Tag{language: zhHansIndex, locale: zhHansIndex}
|
||||
TraditionalChinese Tag = Tag{language: zhHantIndex, locale: zhHantIndex}
|
||||
Zulu Tag = Tag{language: zuIndex, locale: zuIndex}
|
||||
)
|
|
@ -0,0 +1,167 @@
|
|||
// Copyright 2018 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// A Builder allows constructing a Tag from individual components.
|
||||
// Its main user is Compose in the top-level language package.
|
||||
type Builder struct {
|
||||
Tag Tag
|
||||
|
||||
private string // the x extension
|
||||
variants []string
|
||||
extensions []string
|
||||
}
|
||||
|
||||
// Make returns a new Tag from the current settings.
|
||||
func (b *Builder) Make() Tag {
|
||||
t := b.Tag
|
||||
|
||||
if len(b.extensions) > 0 || len(b.variants) > 0 {
|
||||
sort.Sort(sortVariants(b.variants))
|
||||
sort.Strings(b.extensions)
|
||||
|
||||
if b.private != "" {
|
||||
b.extensions = append(b.extensions, b.private)
|
||||
}
|
||||
n := maxCoreSize + tokenLen(b.variants...) + tokenLen(b.extensions...)
|
||||
buf := make([]byte, n)
|
||||
p := t.genCoreBytes(buf)
|
||||
t.pVariant = byte(p)
|
||||
p += appendTokens(buf[p:], b.variants...)
|
||||
t.pExt = uint16(p)
|
||||
p += appendTokens(buf[p:], b.extensions...)
|
||||
t.str = string(buf[:p])
|
||||
// We may not always need to remake the string, but when or when not
|
||||
// to do so is rather tricky.
|
||||
scan := makeScanner(buf[:p])
|
||||
t, _ = parse(&scan, "")
|
||||
return t
|
||||
|
||||
} else if b.private != "" {
|
||||
t.str = b.private
|
||||
t.RemakeString()
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// SetTag copies all the settings from a given Tag. Any previously set values
|
||||
// are discarded.
|
||||
func (b *Builder) SetTag(t Tag) {
|
||||
b.Tag.LangID = t.LangID
|
||||
b.Tag.RegionID = t.RegionID
|
||||
b.Tag.ScriptID = t.ScriptID
|
||||
// TODO: optimize
|
||||
b.variants = b.variants[:0]
|
||||
if variants := t.Variants(); variants != "" {
|
||||
for _, vr := range strings.Split(variants[1:], "-") {
|
||||
b.variants = append(b.variants, vr)
|
||||
}
|
||||
}
|
||||
b.extensions, b.private = b.extensions[:0], ""
|
||||
for _, e := range t.Extensions() {
|
||||
b.AddExt(e)
|
||||
}
|
||||
}
|
||||
|
||||
// AddExt adds extension e to the tag. e must be a valid extension as returned
|
||||
// by Tag.Extension. If the extension already exists, it will be discarded,
|
||||
// except for a -u extension, where non-existing key-type pairs will added.
|
||||
func (b *Builder) AddExt(e string) {
|
||||
if e[0] == 'x' {
|
||||
if b.private == "" {
|
||||
b.private = e
|
||||
}
|
||||
return
|
||||
}
|
||||
for i, s := range b.extensions {
|
||||
if s[0] == e[0] {
|
||||
if e[0] == 'u' {
|
||||
b.extensions[i] += e[1:]
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
b.extensions = append(b.extensions, e)
|
||||
}
|
||||
|
||||
// SetExt sets the extension e to the tag. e must be a valid extension as
|
||||
// returned by Tag.Extension. If the extension already exists, it will be
|
||||
// overwritten, except for a -u extension, where the individual key-type pairs
|
||||
// will be set.
|
||||
func (b *Builder) SetExt(e string) {
|
||||
if e[0] == 'x' {
|
||||
b.private = e
|
||||
return
|
||||
}
|
||||
for i, s := range b.extensions {
|
||||
if s[0] == e[0] {
|
||||
if e[0] == 'u' {
|
||||
b.extensions[i] = e + s[1:]
|
||||
} else {
|
||||
b.extensions[i] = e
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
b.extensions = append(b.extensions, e)
|
||||
}
|
||||
|
||||
// AddVariant adds any number of variants.
|
||||
func (b *Builder) AddVariant(v ...string) {
|
||||
for _, v := range v {
|
||||
if v != "" {
|
||||
b.variants = append(b.variants, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ClearVariants removes any variants previously added, including those
|
||||
// copied from a Tag in SetTag.
|
||||
func (b *Builder) ClearVariants() {
|
||||
b.variants = b.variants[:0]
|
||||
}
|
||||
|
||||
// ClearExtensions removes any extensions previously added, including those
|
||||
// copied from a Tag in SetTag.
|
||||
func (b *Builder) ClearExtensions() {
|
||||
b.private = ""
|
||||
b.extensions = b.extensions[:0]
|
||||
}
|
||||
|
||||
func tokenLen(token ...string) (n int) {
|
||||
for _, t := range token {
|
||||
n += len(t) + 1
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func appendTokens(b []byte, token ...string) int {
|
||||
p := 0
|
||||
for _, t := range token {
|
||||
b[p] = '-'
|
||||
copy(b[p+1:], t)
|
||||
p += 1 + len(t)
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
type sortVariants []string
|
||||
|
||||
func (s sortVariants) Len() int {
|
||||
return len(s)
|
||||
}
|
||||
|
||||
func (s sortVariants) Swap(i, j int) {
|
||||
s[j], s[i] = s[i], s[j]
|
||||
}
|
||||
|
||||
func (s sortVariants) Less(i, j int) bool {
|
||||
return variantIndex[s[i]] < variantIndex[s[j]]
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
// BaseLanguages returns the list of all supported base languages. It generates
|
||||
// the list by traversing the internal structures.
|
||||
func BaseLanguages() []Language {
|
||||
base := make([]Language, 0, NumLanguages)
|
||||
for i := 0; i < langNoIndexOffset; i++ {
|
||||
// We included "und" already for the value 0.
|
||||
if i != nonCanonicalUnd {
|
||||
base = append(base, Language(i))
|
||||
}
|
||||
}
|
||||
i := langNoIndexOffset
|
||||
for _, v := range langNoIndex {
|
||||
for k := 0; k < 8; k++ {
|
||||
if v&1 == 1 {
|
||||
base = append(base, Language(i))
|
||||
}
|
||||
v >>= 1
|
||||
i++
|
||||
}
|
||||
}
|
||||
return base
|
||||
}
|
|
@ -0,0 +1,627 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go gen_common.go -output tables.go
|
||||
|
||||
package language // import "golang.org/x/text/internal/language"
|
||||
|
||||
// TODO: Remove above NOTE after:
|
||||
// - verifying that tables are dropped correctly (most notably matcher tables).
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
// maxCoreSize is the maximum size of a BCP 47 tag without variants and
|
||||
// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
|
||||
maxCoreSize = 12
|
||||
|
||||
// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
|
||||
// is large enough to hold at least 99% of the BCP 47 tags.
|
||||
max99thPercentileSize = 32
|
||||
|
||||
// maxSimpleUExtensionSize is the maximum size of a -u extension with one
|
||||
// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
|
||||
maxSimpleUExtensionSize = 14
|
||||
)
|
||||
|
||||
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
|
||||
// specific language or locale. All language tag values are guaranteed to be
|
||||
// well-formed. The zero value of Tag is Und.
|
||||
type Tag struct {
|
||||
// TODO: the following fields have the form TagTypeID. This name is chosen
|
||||
// to allow refactoring the public package without conflicting with its
|
||||
// Base, Script, and Region methods. Once the transition is fully completed
|
||||
// the ID can be stripped from the name.
|
||||
|
||||
LangID Language
|
||||
RegionID Region
|
||||
// TODO: we will soon run out of positions for ScriptID. Idea: instead of
|
||||
// storing lang, region, and ScriptID codes, store only the compact index and
|
||||
// have a lookup table from this code to its expansion. This greatly speeds
|
||||
// up table lookup, speed up common variant cases.
|
||||
// This will also immediately free up 3 extra bytes. Also, the pVariant
|
||||
// field can now be moved to the lookup table, as the compact index uniquely
|
||||
// determines the offset of a possible variant.
|
||||
ScriptID Script
|
||||
pVariant byte // offset in str, includes preceding '-'
|
||||
pExt uint16 // offset of first extension, includes preceding '-'
|
||||
|
||||
// str is the string representation of the Tag. It will only be used if the
|
||||
// tag has variants or extensions.
|
||||
str string
|
||||
}
|
||||
|
||||
// Make is a convenience wrapper for Parse that omits the error.
|
||||
// In case of an error, a sensible default is returned.
|
||||
func Make(s string) Tag {
|
||||
t, _ := Parse(s)
|
||||
return t
|
||||
}
|
||||
|
||||
// Raw returns the raw base language, script and region, without making an
|
||||
// attempt to infer their values.
|
||||
// TODO: consider removing
|
||||
func (t Tag) Raw() (b Language, s Script, r Region) {
|
||||
return t.LangID, t.ScriptID, t.RegionID
|
||||
}
|
||||
|
||||
// equalTags compares language, script and region subtags only.
|
||||
func (t Tag) equalTags(a Tag) bool {
|
||||
return t.LangID == a.LangID && t.ScriptID == a.ScriptID && t.RegionID == a.RegionID
|
||||
}
|
||||
|
||||
// IsRoot returns true if t is equal to language "und".
|
||||
func (t Tag) IsRoot() bool {
|
||||
if int(t.pVariant) < len(t.str) {
|
||||
return false
|
||||
}
|
||||
return t.equalTags(Und)
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether the Tag consists solely of an IsPrivateUse use
|
||||
// tag.
|
||||
func (t Tag) IsPrivateUse() bool {
|
||||
return t.str != "" && t.pVariant == 0
|
||||
}
|
||||
|
||||
// RemakeString is used to update t.str in case lang, script or region changed.
|
||||
// It is assumed that pExt and pVariant still point to the start of the
|
||||
// respective parts.
|
||||
func (t *Tag) RemakeString() {
|
||||
if t.str == "" {
|
||||
return
|
||||
}
|
||||
extra := t.str[t.pVariant:]
|
||||
if t.pVariant > 0 {
|
||||
extra = extra[1:]
|
||||
}
|
||||
if t.equalTags(Und) && strings.HasPrefix(extra, "x-") {
|
||||
t.str = extra
|
||||
t.pVariant = 0
|
||||
t.pExt = 0
|
||||
return
|
||||
}
|
||||
var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
|
||||
b := buf[:t.genCoreBytes(buf[:])]
|
||||
if extra != "" {
|
||||
diff := len(b) - int(t.pVariant)
|
||||
b = append(b, '-')
|
||||
b = append(b, extra...)
|
||||
t.pVariant = uint8(int(t.pVariant) + diff)
|
||||
t.pExt = uint16(int(t.pExt) + diff)
|
||||
} else {
|
||||
t.pVariant = uint8(len(b))
|
||||
t.pExt = uint16(len(b))
|
||||
}
|
||||
t.str = string(b)
|
||||
}
|
||||
|
||||
// genCoreBytes writes a string for the base languages, script and region tags
|
||||
// to the given buffer and returns the number of bytes written. It will never
|
||||
// write more than maxCoreSize bytes.
|
||||
func (t *Tag) genCoreBytes(buf []byte) int {
|
||||
n := t.LangID.StringToBuf(buf[:])
|
||||
if t.ScriptID != 0 {
|
||||
n += copy(buf[n:], "-")
|
||||
n += copy(buf[n:], t.ScriptID.String())
|
||||
}
|
||||
if t.RegionID != 0 {
|
||||
n += copy(buf[n:], "-")
|
||||
n += copy(buf[n:], t.RegionID.String())
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// String returns the canonical string representation of the language tag.
|
||||
func (t Tag) String() string {
|
||||
if t.str != "" {
|
||||
return t.str
|
||||
}
|
||||
if t.ScriptID == 0 && t.RegionID == 0 {
|
||||
return t.LangID.String()
|
||||
}
|
||||
buf := [maxCoreSize]byte{}
|
||||
return string(buf[:t.genCoreBytes(buf[:])])
|
||||
}
|
||||
|
||||
// MarshalText implements encoding.TextMarshaler.
|
||||
func (t Tag) MarshalText() (text []byte, err error) {
|
||||
if t.str != "" {
|
||||
text = append(text, t.str...)
|
||||
} else if t.ScriptID == 0 && t.RegionID == 0 {
|
||||
text = append(text, t.LangID.String()...)
|
||||
} else {
|
||||
buf := [maxCoreSize]byte{}
|
||||
text = buf[:t.genCoreBytes(buf[:])]
|
||||
}
|
||||
return text, nil
|
||||
}
|
||||
|
||||
// UnmarshalText implements encoding.TextUnmarshaler.
|
||||
func (t *Tag) UnmarshalText(text []byte) error {
|
||||
tag, err := Parse(string(text))
|
||||
*t = tag
|
||||
return err
|
||||
}
|
||||
|
||||
// Variants returns the part of the tag holding all variants or the empty string
|
||||
// if there are no variants defined.
|
||||
func (t Tag) Variants() string {
|
||||
if t.pVariant == 0 {
|
||||
return ""
|
||||
}
|
||||
return t.str[t.pVariant:t.pExt]
|
||||
}
|
||||
|
||||
// VariantOrPrivateUseTags returns variants or private use tags.
|
||||
func (t Tag) VariantOrPrivateUseTags() string {
|
||||
if t.pExt > 0 {
|
||||
return t.str[t.pVariant:t.pExt]
|
||||
}
|
||||
return t.str[t.pVariant:]
|
||||
}
|
||||
|
||||
// HasString reports whether this tag defines more than just the raw
|
||||
// components.
|
||||
func (t Tag) HasString() bool {
|
||||
return t.str != ""
|
||||
}
|
||||
|
||||
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
|
||||
// specific language are substituted with fields from the parent language.
|
||||
// The parent for a language may change for newer versions of CLDR.
|
||||
func (t Tag) Parent() Tag {
|
||||
if t.str != "" {
|
||||
// Strip the variants and extensions.
|
||||
b, s, r := t.Raw()
|
||||
t = Tag{LangID: b, ScriptID: s, RegionID: r}
|
||||
if t.RegionID == 0 && t.ScriptID != 0 && t.LangID != 0 {
|
||||
base, _ := addTags(Tag{LangID: t.LangID})
|
||||
if base.ScriptID == t.ScriptID {
|
||||
return Tag{LangID: t.LangID}
|
||||
}
|
||||
}
|
||||
return t
|
||||
}
|
||||
if t.LangID != 0 {
|
||||
if t.RegionID != 0 {
|
||||
maxScript := t.ScriptID
|
||||
if maxScript == 0 {
|
||||
max, _ := addTags(t)
|
||||
maxScript = max.ScriptID
|
||||
}
|
||||
|
||||
for i := range parents {
|
||||
if Language(parents[i].lang) == t.LangID && Script(parents[i].maxScript) == maxScript {
|
||||
for _, r := range parents[i].fromRegion {
|
||||
if Region(r) == t.RegionID {
|
||||
return Tag{
|
||||
LangID: t.LangID,
|
||||
ScriptID: Script(parents[i].script),
|
||||
RegionID: Region(parents[i].toRegion),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Strip the script if it is the default one.
|
||||
base, _ := addTags(Tag{LangID: t.LangID})
|
||||
if base.ScriptID != maxScript {
|
||||
return Tag{LangID: t.LangID, ScriptID: maxScript}
|
||||
}
|
||||
return Tag{LangID: t.LangID}
|
||||
} else if t.ScriptID != 0 {
|
||||
// The parent for an base-script pair with a non-default script is
|
||||
// "und" instead of the base language.
|
||||
base, _ := addTags(Tag{LangID: t.LangID})
|
||||
if base.ScriptID != t.ScriptID {
|
||||
return Und
|
||||
}
|
||||
return Tag{LangID: t.LangID}
|
||||
}
|
||||
}
|
||||
return Und
|
||||
}
|
||||
|
||||
// ParseExtension parses s as an extension and returns it on success.
|
||||
func ParseExtension(s string) (ext string, err error) {
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
ext = ""
|
||||
err = ErrSyntax
|
||||
}
|
||||
}()
|
||||
|
||||
scan := makeScannerString(s)
|
||||
var end int
|
||||
if n := len(scan.token); n != 1 {
|
||||
return "", ErrSyntax
|
||||
}
|
||||
scan.toLower(0, len(scan.b))
|
||||
end = parseExtension(&scan)
|
||||
if end != len(s) {
|
||||
return "", ErrSyntax
|
||||
}
|
||||
return string(scan.b), nil
|
||||
}
|
||||
|
||||
// HasVariants reports whether t has variants.
|
||||
func (t Tag) HasVariants() bool {
|
||||
return uint16(t.pVariant) < t.pExt
|
||||
}
|
||||
|
||||
// HasExtensions reports whether t has extensions.
|
||||
func (t Tag) HasExtensions() bool {
|
||||
return int(t.pExt) < len(t.str)
|
||||
}
|
||||
|
||||
// Extension returns the extension of type x for tag t. It will return
|
||||
// false for ok if t does not have the requested extension. The returned
|
||||
// extension will be invalid in this case.
|
||||
func (t Tag) Extension(x byte) (ext string, ok bool) {
|
||||
for i := int(t.pExt); i < len(t.str)-1; {
|
||||
var ext string
|
||||
i, ext = getExtension(t.str, i)
|
||||
if ext[0] == x {
|
||||
return ext, true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
// Extensions returns all extensions of t.
|
||||
func (t Tag) Extensions() []string {
|
||||
e := []string{}
|
||||
for i := int(t.pExt); i < len(t.str)-1; {
|
||||
var ext string
|
||||
i, ext = getExtension(t.str, i)
|
||||
e = append(e, ext)
|
||||
}
|
||||
return e
|
||||
}
|
||||
|
||||
// TypeForKey returns the type associated with the given key, where key and type
|
||||
// are of the allowed values defined for the Unicode locale extension ('u') in
|
||||
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// TypeForKey will traverse the inheritance chain to get the correct value.
|
||||
//
|
||||
// If there are multiple types associated with a key, only the first will be
|
||||
// returned. If there is no type associated with a key, it returns the empty
|
||||
// string.
|
||||
func (t Tag) TypeForKey(key string) string {
|
||||
if _, start, end, _ := t.findTypeForKey(key); end != start {
|
||||
s := t.str[start:end]
|
||||
if p := strings.IndexByte(s, '-'); p >= 0 {
|
||||
s = s[:p]
|
||||
}
|
||||
return s
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
var (
|
||||
errPrivateUse = errors.New("cannot set a key on a private use tag")
|
||||
errInvalidArguments = errors.New("invalid key or type")
|
||||
)
|
||||
|
||||
// SetTypeForKey returns a new Tag with the key set to type, where key and type
|
||||
// are of the allowed values defined for the Unicode locale extension ('u') in
|
||||
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// An empty value removes an existing pair with the same key.
|
||||
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
|
||||
if t.IsPrivateUse() {
|
||||
return t, errPrivateUse
|
||||
}
|
||||
if len(key) != 2 {
|
||||
return t, errInvalidArguments
|
||||
}
|
||||
|
||||
// Remove the setting if value is "".
|
||||
if value == "" {
|
||||
start, sep, end, _ := t.findTypeForKey(key)
|
||||
if start != sep {
|
||||
// Remove a possible empty extension.
|
||||
switch {
|
||||
case t.str[start-2] != '-': // has previous elements.
|
||||
case end == len(t.str), // end of string
|
||||
end+2 < len(t.str) && t.str[end+2] == '-': // end of extension
|
||||
start -= 2
|
||||
}
|
||||
if start == int(t.pVariant) && end == len(t.str) {
|
||||
t.str = ""
|
||||
t.pVariant, t.pExt = 0, 0
|
||||
} else {
|
||||
t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
|
||||
}
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
if len(value) < 3 || len(value) > 8 {
|
||||
return t, errInvalidArguments
|
||||
}
|
||||
|
||||
var (
|
||||
buf [maxCoreSize + maxSimpleUExtensionSize]byte
|
||||
uStart int // start of the -u extension.
|
||||
)
|
||||
|
||||
// Generate the tag string if needed.
|
||||
if t.str == "" {
|
||||
uStart = t.genCoreBytes(buf[:])
|
||||
buf[uStart] = '-'
|
||||
uStart++
|
||||
}
|
||||
|
||||
// Create new key-type pair and parse it to verify.
|
||||
b := buf[uStart:]
|
||||
copy(b, "u-")
|
||||
copy(b[2:], key)
|
||||
b[4] = '-'
|
||||
b = b[:5+copy(b[5:], value)]
|
||||
scan := makeScanner(b)
|
||||
if parseExtensions(&scan); scan.err != nil {
|
||||
return t, scan.err
|
||||
}
|
||||
|
||||
// Assemble the replacement string.
|
||||
if t.str == "" {
|
||||
t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
|
||||
t.str = string(buf[:uStart+len(b)])
|
||||
} else {
|
||||
s := t.str
|
||||
start, sep, end, hasExt := t.findTypeForKey(key)
|
||||
if start == sep {
|
||||
if hasExt {
|
||||
b = b[2:]
|
||||
}
|
||||
t.str = fmt.Sprintf("%s-%s%s", s[:sep], b, s[end:])
|
||||
} else {
|
||||
t.str = fmt.Sprintf("%s-%s%s", s[:start+3], value, s[end:])
|
||||
}
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// findTypeForKey returns the start and end position for the type corresponding
|
||||
// to key or the point at which to insert the key-value pair if the type
|
||||
// wasn't found. The hasExt return value reports whether an -u extension was present.
|
||||
// Note: the extensions are typically very small and are likely to contain
|
||||
// only one key-type pair.
|
||||
func (t Tag) findTypeForKey(key string) (start, sep, end int, hasExt bool) {
|
||||
p := int(t.pExt)
|
||||
if len(key) != 2 || p == len(t.str) || p == 0 {
|
||||
return p, p, p, false
|
||||
}
|
||||
s := t.str
|
||||
|
||||
// Find the correct extension.
|
||||
for p++; s[p] != 'u'; p++ {
|
||||
if s[p] > 'u' {
|
||||
p--
|
||||
return p, p, p, false
|
||||
}
|
||||
if p = nextExtension(s, p); p == len(s) {
|
||||
return len(s), len(s), len(s), false
|
||||
}
|
||||
}
|
||||
// Proceed to the hyphen following the extension name.
|
||||
p++
|
||||
|
||||
// curKey is the key currently being processed.
|
||||
curKey := ""
|
||||
|
||||
// Iterate over keys until we get the end of a section.
|
||||
for {
|
||||
end = p
|
||||
for p++; p < len(s) && s[p] != '-'; p++ {
|
||||
}
|
||||
n := p - end - 1
|
||||
if n <= 2 && curKey == key {
|
||||
if sep < end {
|
||||
sep++
|
||||
}
|
||||
return start, sep, end, true
|
||||
}
|
||||
switch n {
|
||||
case 0, // invalid string
|
||||
1: // next extension
|
||||
return end, end, end, true
|
||||
case 2:
|
||||
// next key
|
||||
curKey = s[end+1 : p]
|
||||
if curKey > key {
|
||||
return end, end, end, true
|
||||
}
|
||||
start = end
|
||||
sep = p
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ParseBase parses a 2- or 3-letter ISO 639 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown language identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseBase(s string) (l Language, err error) {
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
l = 0
|
||||
err = ErrSyntax
|
||||
}
|
||||
}()
|
||||
|
||||
if n := len(s); n < 2 || 3 < n {
|
||||
return 0, ErrSyntax
|
||||
}
|
||||
var buf [3]byte
|
||||
return getLangID(buf[:copy(buf[:], s)])
|
||||
}
|
||||
|
||||
// ParseScript parses a 4-letter ISO 15924 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown script identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseScript(s string) (scr Script, err error) {
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
scr = 0
|
||||
err = ErrSyntax
|
||||
}
|
||||
}()
|
||||
|
||||
if len(s) != 4 {
|
||||
return 0, ErrSyntax
|
||||
}
|
||||
var buf [4]byte
|
||||
return getScriptID(script, buf[:copy(buf[:], s)])
|
||||
}
|
||||
|
||||
// EncodeM49 returns the Region for the given UN M.49 code.
|
||||
// It returns an error if r is not a valid code.
|
||||
func EncodeM49(r int) (Region, error) {
|
||||
return getRegionM49(r)
|
||||
}
|
||||
|
||||
// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown region identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseRegion(s string) (r Region, err error) {
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
r = 0
|
||||
err = ErrSyntax
|
||||
}
|
||||
}()
|
||||
|
||||
if n := len(s); n < 2 || 3 < n {
|
||||
return 0, ErrSyntax
|
||||
}
|
||||
var buf [3]byte
|
||||
return getRegionID(buf[:copy(buf[:], s)])
|
||||
}
|
||||
|
||||
// IsCountry returns whether this region is a country or autonomous area. This
|
||||
// includes non-standard definitions from CLDR.
|
||||
func (r Region) IsCountry() bool {
|
||||
if r == 0 || r.IsGroup() || r.IsPrivateUse() && r != _XK {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// IsGroup returns whether this region defines a collection of regions. This
|
||||
// includes non-standard definitions from CLDR.
|
||||
func (r Region) IsGroup() bool {
|
||||
if r == 0 {
|
||||
return false
|
||||
}
|
||||
return int(regionInclusion[r]) < len(regionContainment)
|
||||
}
|
||||
|
||||
// Contains returns whether Region c is contained by Region r. It returns true
|
||||
// if c == r.
|
||||
func (r Region) Contains(c Region) bool {
|
||||
if r == c {
|
||||
return true
|
||||
}
|
||||
g := regionInclusion[r]
|
||||
if g >= nRegionGroups {
|
||||
return false
|
||||
}
|
||||
m := regionContainment[g]
|
||||
|
||||
d := regionInclusion[c]
|
||||
b := regionInclusionBits[d]
|
||||
|
||||
// A contained country may belong to multiple disjoint groups. Matching any
|
||||
// of these indicates containment. If the contained region is a group, it
|
||||
// must strictly be a subset.
|
||||
if d >= nRegionGroups {
|
||||
return b&m != 0
|
||||
}
|
||||
return b&^m == 0
|
||||
}
|
||||
|
||||
var errNoTLD = errors.New("language: region is not a valid ccTLD")
|
||||
|
||||
// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
|
||||
// In all other cases it returns either the region itself or an error.
|
||||
//
|
||||
// This method may return an error for a region for which there exists a
|
||||
// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
|
||||
// region will already be canonicalized it was obtained from a Tag that was
|
||||
// obtained using any of the default methods.
|
||||
func (r Region) TLD() (Region, error) {
|
||||
// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
|
||||
// difference between ISO 3166-1 and IANA ccTLD.
|
||||
if r == _GB {
|
||||
r = _UK
|
||||
}
|
||||
if (r.typ() & ccTLD) == 0 {
|
||||
return 0, errNoTLD
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// Canonicalize returns the region or a possible replacement if the region is
|
||||
// deprecated. It will not return a replacement for deprecated regions that
|
||||
// are split into multiple regions.
|
||||
func (r Region) Canonicalize() Region {
|
||||
if cr := normRegion(r); cr != 0 {
|
||||
return cr
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// Variant represents a registered variant of a language as defined by BCP 47.
|
||||
type Variant struct {
|
||||
ID uint8
|
||||
str string
|
||||
}
|
||||
|
||||
// ParseVariant parses and returns a Variant. An error is returned if s is not
|
||||
// a valid variant.
|
||||
func ParseVariant(s string) (v Variant, err error) {
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
v = Variant{}
|
||||
err = ErrSyntax
|
||||
}
|
||||
}()
|
||||
|
||||
s = strings.ToLower(s)
|
||||
if id, ok := variantIndex[s]; ok {
|
||||
return Variant{id, s}, nil
|
||||
}
|
||||
return Variant{}, NewValueError([]byte(s))
|
||||
}
|
||||
|
||||
// String returns the string representation of the variant.
|
||||
func (v Variant) String() string {
|
||||
return v.str
|
||||
}
|
|
@ -0,0 +1,412 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
|
||||
"golang.org/x/text/internal/tag"
|
||||
)
|
||||
|
||||
// findIndex tries to find the given tag in idx and returns a standardized error
|
||||
// if it could not be found.
|
||||
func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
|
||||
if !tag.FixCase(form, key) {
|
||||
return 0, ErrSyntax
|
||||
}
|
||||
i := idx.Index(key)
|
||||
if i == -1 {
|
||||
return 0, NewValueError(key)
|
||||
}
|
||||
return i, nil
|
||||
}
|
||||
|
||||
func searchUint(imap []uint16, key uint16) int {
|
||||
return sort.Search(len(imap), func(i int) bool {
|
||||
return imap[i] >= key
|
||||
})
|
||||
}
|
||||
|
||||
type Language uint16
|
||||
|
||||
// getLangID returns the langID of s if s is a canonical subtag
|
||||
// or langUnknown if s is not a canonical subtag.
|
||||
func getLangID(s []byte) (Language, error) {
|
||||
if len(s) == 2 {
|
||||
return getLangISO2(s)
|
||||
}
|
||||
return getLangISO3(s)
|
||||
}
|
||||
|
||||
// TODO language normalization as well as the AliasMaps could be moved to the
|
||||
// higher level package, but it is a bit tricky to separate the generation.
|
||||
|
||||
func (id Language) Canonicalize() (Language, AliasType) {
|
||||
return normLang(id)
|
||||
}
|
||||
|
||||
// normLang returns the mapped langID of id according to mapping m.
|
||||
func normLang(id Language) (Language, AliasType) {
|
||||
k := sort.Search(len(AliasMap), func(i int) bool {
|
||||
return AliasMap[i].From >= uint16(id)
|
||||
})
|
||||
if k < len(AliasMap) && AliasMap[k].From == uint16(id) {
|
||||
return Language(AliasMap[k].To), AliasTypes[k]
|
||||
}
|
||||
return id, AliasTypeUnknown
|
||||
}
|
||||
|
||||
// getLangISO2 returns the langID for the given 2-letter ISO language code
|
||||
// or unknownLang if this does not exist.
|
||||
func getLangISO2(s []byte) (Language, error) {
|
||||
if !tag.FixCase("zz", s) {
|
||||
return 0, ErrSyntax
|
||||
}
|
||||
if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
|
||||
return Language(i), nil
|
||||
}
|
||||
return 0, NewValueError(s)
|
||||
}
|
||||
|
||||
const base = 'z' - 'a' + 1
|
||||
|
||||
func strToInt(s []byte) uint {
|
||||
v := uint(0)
|
||||
for i := 0; i < len(s); i++ {
|
||||
v *= base
|
||||
v += uint(s[i] - 'a')
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// converts the given integer to the original ASCII string passed to strToInt.
|
||||
// len(s) must match the number of characters obtained.
|
||||
func intToStr(v uint, s []byte) {
|
||||
for i := len(s) - 1; i >= 0; i-- {
|
||||
s[i] = byte(v%base) + 'a'
|
||||
v /= base
|
||||
}
|
||||
}
|
||||
|
||||
// getLangISO3 returns the langID for the given 3-letter ISO language code
|
||||
// or unknownLang if this does not exist.
|
||||
func getLangISO3(s []byte) (Language, error) {
|
||||
if tag.FixCase("und", s) {
|
||||
// first try to match canonical 3-letter entries
|
||||
for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
|
||||
if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
|
||||
// We treat "und" as special and always translate it to "unspecified".
|
||||
// Note that ZZ and Zzzz are private use and are not treated as
|
||||
// unspecified by default.
|
||||
id := Language(i)
|
||||
if id == nonCanonicalUnd {
|
||||
return 0, nil
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
}
|
||||
if i := altLangISO3.Index(s); i != -1 {
|
||||
return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil
|
||||
}
|
||||
n := strToInt(s)
|
||||
if langNoIndex[n/8]&(1<<(n%8)) != 0 {
|
||||
return Language(n) + langNoIndexOffset, nil
|
||||
}
|
||||
// Check for non-canonical uses of ISO3.
|
||||
for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
|
||||
if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
|
||||
return Language(i), nil
|
||||
}
|
||||
}
|
||||
return 0, NewValueError(s)
|
||||
}
|
||||
return 0, ErrSyntax
|
||||
}
|
||||
|
||||
// StringToBuf writes the string to b and returns the number of bytes
|
||||
// written. cap(b) must be >= 3.
|
||||
func (id Language) StringToBuf(b []byte) int {
|
||||
if id >= langNoIndexOffset {
|
||||
intToStr(uint(id)-langNoIndexOffset, b[:3])
|
||||
return 3
|
||||
} else if id == 0 {
|
||||
return copy(b, "und")
|
||||
}
|
||||
l := lang[id<<2:]
|
||||
if l[3] == 0 {
|
||||
return copy(b, l[:3])
|
||||
}
|
||||
return copy(b, l[:2])
|
||||
}
|
||||
|
||||
// String returns the BCP 47 representation of the langID.
|
||||
// Use b as variable name, instead of id, to ensure the variable
|
||||
// used is consistent with that of Base in which this type is embedded.
|
||||
func (b Language) String() string {
|
||||
if b == 0 {
|
||||
return "und"
|
||||
} else if b >= langNoIndexOffset {
|
||||
b -= langNoIndexOffset
|
||||
buf := [3]byte{}
|
||||
intToStr(uint(b), buf[:])
|
||||
return string(buf[:])
|
||||
}
|
||||
l := lang.Elem(int(b))
|
||||
if l[3] == 0 {
|
||||
return l[:3]
|
||||
}
|
||||
return l[:2]
|
||||
}
|
||||
|
||||
// ISO3 returns the ISO 639-3 language code.
|
||||
func (b Language) ISO3() string {
|
||||
if b == 0 || b >= langNoIndexOffset {
|
||||
return b.String()
|
||||
}
|
||||
l := lang.Elem(int(b))
|
||||
if l[3] == 0 {
|
||||
return l[:3]
|
||||
} else if l[2] == 0 {
|
||||
return altLangISO3.Elem(int(l[3]))[:3]
|
||||
}
|
||||
// This allocation will only happen for 3-letter ISO codes
|
||||
// that are non-canonical BCP 47 language identifiers.
|
||||
return l[0:1] + l[2:4]
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether this language code is reserved for private use.
|
||||
func (b Language) IsPrivateUse() bool {
|
||||
return langPrivateStart <= b && b <= langPrivateEnd
|
||||
}
|
||||
|
||||
// SuppressScript returns the script marked as SuppressScript in the IANA
|
||||
// language tag repository, or 0 if there is no such script.
|
||||
func (b Language) SuppressScript() Script {
|
||||
if b < langNoIndexOffset {
|
||||
return Script(suppressScript[b])
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
type Region uint16
|
||||
|
||||
// getRegionID returns the region id for s if s is a valid 2-letter region code
|
||||
// or unknownRegion.
|
||||
func getRegionID(s []byte) (Region, error) {
|
||||
if len(s) == 3 {
|
||||
if isAlpha(s[0]) {
|
||||
return getRegionISO3(s)
|
||||
}
|
||||
if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
|
||||
return getRegionM49(int(i))
|
||||
}
|
||||
}
|
||||
return getRegionISO2(s)
|
||||
}
|
||||
|
||||
// getRegionISO2 returns the regionID for the given 2-letter ISO country code
|
||||
// or unknownRegion if this does not exist.
|
||||
func getRegionISO2(s []byte) (Region, error) {
|
||||
i, err := findIndex(regionISO, s, "ZZ")
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return Region(i) + isoRegionOffset, nil
|
||||
}
|
||||
|
||||
// getRegionISO3 returns the regionID for the given 3-letter ISO country code
|
||||
// or unknownRegion if this does not exist.
|
||||
func getRegionISO3(s []byte) (Region, error) {
|
||||
if tag.FixCase("ZZZ", s) {
|
||||
for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
|
||||
if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
|
||||
return Region(i) + isoRegionOffset, nil
|
||||
}
|
||||
}
|
||||
for i := 0; i < len(altRegionISO3); i += 3 {
|
||||
if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
|
||||
return Region(altRegionIDs[i/3]), nil
|
||||
}
|
||||
}
|
||||
return 0, NewValueError(s)
|
||||
}
|
||||
return 0, ErrSyntax
|
||||
}
|
||||
|
||||
func getRegionM49(n int) (Region, error) {
|
||||
if 0 < n && n <= 999 {
|
||||
const (
|
||||
searchBits = 7
|
||||
regionBits = 9
|
||||
regionMask = 1<<regionBits - 1
|
||||
)
|
||||
idx := n >> searchBits
|
||||
buf := fromM49[m49Index[idx]:m49Index[idx+1]]
|
||||
val := uint16(n) << regionBits // we rely on bits shifting out
|
||||
i := sort.Search(len(buf), func(i int) bool {
|
||||
return buf[i] >= val
|
||||
})
|
||||
if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
|
||||
return Region(r & regionMask), nil
|
||||
}
|
||||
}
|
||||
var e ValueError
|
||||
fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
|
||||
return 0, e
|
||||
}
|
||||
|
||||
// normRegion returns a region if r is deprecated or 0 otherwise.
|
||||
// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
|
||||
// TODO: consider mapping split up regions to new most populous one (like CLDR).
|
||||
func normRegion(r Region) Region {
|
||||
m := regionOldMap
|
||||
k := sort.Search(len(m), func(i int) bool {
|
||||
return m[i].From >= uint16(r)
|
||||
})
|
||||
if k < len(m) && m[k].From == uint16(r) {
|
||||
return Region(m[k].To)
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
const (
|
||||
iso3166UserAssigned = 1 << iota
|
||||
ccTLD
|
||||
bcp47Region
|
||||
)
|
||||
|
||||
func (r Region) typ() byte {
|
||||
return regionTypes[r]
|
||||
}
|
||||
|
||||
// String returns the BCP 47 representation for the region.
|
||||
// It returns "ZZ" for an unspecified region.
|
||||
func (r Region) String() string {
|
||||
if r < isoRegionOffset {
|
||||
if r == 0 {
|
||||
return "ZZ"
|
||||
}
|
||||
return fmt.Sprintf("%03d", r.M49())
|
||||
}
|
||||
r -= isoRegionOffset
|
||||
return regionISO.Elem(int(r))[:2]
|
||||
}
|
||||
|
||||
// ISO3 returns the 3-letter ISO code of r.
|
||||
// Note that not all regions have a 3-letter ISO code.
|
||||
// In such cases this method returns "ZZZ".
|
||||
func (r Region) ISO3() string {
|
||||
if r < isoRegionOffset {
|
||||
return "ZZZ"
|
||||
}
|
||||
r -= isoRegionOffset
|
||||
reg := regionISO.Elem(int(r))
|
||||
switch reg[2] {
|
||||
case 0:
|
||||
return altRegionISO3[reg[3]:][:3]
|
||||
case ' ':
|
||||
return "ZZZ"
|
||||
}
|
||||
return reg[0:1] + reg[2:4]
|
||||
}
|
||||
|
||||
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
|
||||
// is not defined for r.
|
||||
func (r Region) M49() int {
|
||||
return int(m49[r])
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
|
||||
// may include private-use tags that are assigned by CLDR and used in this
|
||||
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
|
||||
func (r Region) IsPrivateUse() bool {
|
||||
return r.typ()&iso3166UserAssigned != 0
|
||||
}
|
||||
|
||||
type Script uint16
|
||||
|
||||
// getScriptID returns the script id for string s. It assumes that s
|
||||
// is of the format [A-Z][a-z]{3}.
|
||||
func getScriptID(idx tag.Index, s []byte) (Script, error) {
|
||||
i, err := findIndex(idx, s, "Zzzz")
|
||||
return Script(i), err
|
||||
}
|
||||
|
||||
// String returns the script code in title case.
|
||||
// It returns "Zzzz" for an unspecified script.
|
||||
func (s Script) String() string {
|
||||
if s == 0 {
|
||||
return "Zzzz"
|
||||
}
|
||||
return script.Elem(int(s))
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether this script code is reserved for private use.
|
||||
func (s Script) IsPrivateUse() bool {
|
||||
return _Qaaa <= s && s <= _Qabx
|
||||
}
|
||||
|
||||
const (
|
||||
maxAltTaglen = len("en-US-POSIX")
|
||||
maxLen = maxAltTaglen
|
||||
)
|
||||
|
||||
var (
|
||||
// grandfatheredMap holds a mapping from legacy and grandfathered tags to
|
||||
// their base language or index to more elaborate tag.
|
||||
grandfatheredMap = map[[maxLen]byte]int16{
|
||||
[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
|
||||
[maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami
|
||||
[maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn
|
||||
[maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak
|
||||
[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon
|
||||
[maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux
|
||||
[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo
|
||||
[maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn
|
||||
[maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao
|
||||
[maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay
|
||||
[maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu
|
||||
[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok
|
||||
[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn
|
||||
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR
|
||||
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL
|
||||
[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE
|
||||
[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu
|
||||
[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka
|
||||
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
|
||||
[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang
|
||||
|
||||
// Grandfathered tags with no modern replacement will be converted as
|
||||
// follows:
|
||||
[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
|
||||
[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed
|
||||
[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default
|
||||
[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian
|
||||
[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo
|
||||
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min
|
||||
|
||||
// CLDR-specific tag.
|
||||
[maxLen]byte{'r', 'o', 'o', 't'}: 0, // root
|
||||
[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX"
|
||||
}
|
||||
|
||||
altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}
|
||||
|
||||
altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
|
||||
)
|
||||
|
||||
func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
|
||||
if v, ok := grandfatheredMap[s]; ok {
|
||||
if v < 0 {
|
||||
return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
|
||||
}
|
||||
t.LangID = Language(v)
|
||||
return t, true
|
||||
}
|
||||
return t, false
|
||||
}
|
|
@ -0,0 +1,226 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import "errors"
|
||||
|
||||
type scriptRegionFlags uint8
|
||||
|
||||
const (
|
||||
isList = 1 << iota
|
||||
scriptInFrom
|
||||
regionInFrom
|
||||
)
|
||||
|
||||
func (t *Tag) setUndefinedLang(id Language) {
|
||||
if t.LangID == 0 {
|
||||
t.LangID = id
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tag) setUndefinedScript(id Script) {
|
||||
if t.ScriptID == 0 {
|
||||
t.ScriptID = id
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tag) setUndefinedRegion(id Region) {
|
||||
if t.RegionID == 0 || t.RegionID.Contains(id) {
|
||||
t.RegionID = id
|
||||
}
|
||||
}
|
||||
|
||||
// ErrMissingLikelyTagsData indicates no information was available
|
||||
// to compute likely values of missing tags.
|
||||
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
|
||||
|
||||
// addLikelySubtags sets subtags to their most likely value, given the locale.
|
||||
// In most cases this means setting fields for unknown values, but in some
|
||||
// cases it may alter a value. It returns an ErrMissingLikelyTagsData error
|
||||
// if the given locale cannot be expanded.
|
||||
func (t Tag) addLikelySubtags() (Tag, error) {
|
||||
id, err := addTags(t)
|
||||
if err != nil {
|
||||
return t, err
|
||||
} else if id.equalTags(t) {
|
||||
return t, nil
|
||||
}
|
||||
id.RemakeString()
|
||||
return id, nil
|
||||
}
|
||||
|
||||
// specializeRegion attempts to specialize a group region.
|
||||
func specializeRegion(t *Tag) bool {
|
||||
if i := regionInclusion[t.RegionID]; i < nRegionGroups {
|
||||
x := likelyRegionGroup[i]
|
||||
if Language(x.lang) == t.LangID && Script(x.script) == t.ScriptID {
|
||||
t.RegionID = Region(x.region)
|
||||
}
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Maximize returns a new tag with missing tags filled in.
|
||||
func (t Tag) Maximize() (Tag, error) {
|
||||
return addTags(t)
|
||||
}
|
||||
|
||||
func addTags(t Tag) (Tag, error) {
|
||||
// We leave private use identifiers alone.
|
||||
if t.IsPrivateUse() {
|
||||
return t, nil
|
||||
}
|
||||
if t.ScriptID != 0 && t.RegionID != 0 {
|
||||
if t.LangID != 0 {
|
||||
// already fully specified
|
||||
specializeRegion(&t)
|
||||
return t, nil
|
||||
}
|
||||
// Search matches for und-script-region. Note that for these cases
|
||||
// region will never be a group so there is no need to check for this.
|
||||
list := likelyRegion[t.RegionID : t.RegionID+1]
|
||||
if x := list[0]; x.flags&isList != 0 {
|
||||
list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
|
||||
}
|
||||
for _, x := range list {
|
||||
// Deviating from the spec. See match_test.go for details.
|
||||
if Script(x.script) == t.ScriptID {
|
||||
t.setUndefinedLang(Language(x.lang))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
if t.LangID != 0 {
|
||||
// Search matches for lang-script and lang-region, where lang != und.
|
||||
if t.LangID < langNoIndexOffset {
|
||||
x := likelyLang[t.LangID]
|
||||
if x.flags&isList != 0 {
|
||||
list := likelyLangList[x.region : x.region+uint16(x.script)]
|
||||
if t.ScriptID != 0 {
|
||||
for _, x := range list {
|
||||
if Script(x.script) == t.ScriptID && x.flags&scriptInFrom != 0 {
|
||||
t.setUndefinedRegion(Region(x.region))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
} else if t.RegionID != 0 {
|
||||
count := 0
|
||||
goodScript := true
|
||||
tt := t
|
||||
for _, x := range list {
|
||||
// We visit all entries for which the script was not
|
||||
// defined, including the ones where the region was not
|
||||
// defined. This allows for proper disambiguation within
|
||||
// regions.
|
||||
if x.flags&scriptInFrom == 0 && t.RegionID.Contains(Region(x.region)) {
|
||||
tt.RegionID = Region(x.region)
|
||||
tt.setUndefinedScript(Script(x.script))
|
||||
goodScript = goodScript && tt.ScriptID == Script(x.script)
|
||||
count++
|
||||
}
|
||||
}
|
||||
if count == 1 {
|
||||
return tt, nil
|
||||
}
|
||||
// Even if we fail to find a unique Region, we might have
|
||||
// an unambiguous script.
|
||||
if goodScript {
|
||||
t.ScriptID = tt.ScriptID
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Search matches for und-script.
|
||||
if t.ScriptID != 0 {
|
||||
x := likelyScript[t.ScriptID]
|
||||
if x.region != 0 {
|
||||
t.setUndefinedRegion(Region(x.region))
|
||||
t.setUndefinedLang(Language(x.lang))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
// Search matches for und-region. If und-script-region exists, it would
|
||||
// have been found earlier.
|
||||
if t.RegionID != 0 {
|
||||
if i := regionInclusion[t.RegionID]; i < nRegionGroups {
|
||||
x := likelyRegionGroup[i]
|
||||
if x.region != 0 {
|
||||
t.setUndefinedLang(Language(x.lang))
|
||||
t.setUndefinedScript(Script(x.script))
|
||||
t.RegionID = Region(x.region)
|
||||
}
|
||||
} else {
|
||||
x := likelyRegion[t.RegionID]
|
||||
if x.flags&isList != 0 {
|
||||
x = likelyRegionList[x.lang]
|
||||
}
|
||||
if x.script != 0 && x.flags != scriptInFrom {
|
||||
t.setUndefinedLang(Language(x.lang))
|
||||
t.setUndefinedScript(Script(x.script))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Search matches for lang.
|
||||
if t.LangID < langNoIndexOffset {
|
||||
x := likelyLang[t.LangID]
|
||||
if x.flags&isList != 0 {
|
||||
x = likelyLangList[x.region]
|
||||
}
|
||||
if x.region != 0 {
|
||||
t.setUndefinedScript(Script(x.script))
|
||||
t.setUndefinedRegion(Region(x.region))
|
||||
}
|
||||
specializeRegion(&t)
|
||||
if t.LangID == 0 {
|
||||
t.LangID = _en // default language
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
return t, ErrMissingLikelyTagsData
|
||||
}
|
||||
|
||||
func (t *Tag) setTagsFrom(id Tag) {
|
||||
t.LangID = id.LangID
|
||||
t.ScriptID = id.ScriptID
|
||||
t.RegionID = id.RegionID
|
||||
}
|
||||
|
||||
// minimize removes the region or script subtags from t such that
|
||||
// t.addLikelySubtags() == t.minimize().addLikelySubtags().
|
||||
func (t Tag) minimize() (Tag, error) {
|
||||
t, err := minimizeTags(t)
|
||||
if err != nil {
|
||||
return t, err
|
||||
}
|
||||
t.RemakeString()
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// minimizeTags mimics the behavior of the ICU 51 C implementation.
|
||||
func minimizeTags(t Tag) (Tag, error) {
|
||||
if t.equalTags(Und) {
|
||||
return t, nil
|
||||
}
|
||||
max, err := addTags(t)
|
||||
if err != nil {
|
||||
return t, err
|
||||
}
|
||||
for _, id := range [...]Tag{
|
||||
{LangID: t.LangID},
|
||||
{LangID: t.LangID, RegionID: t.RegionID},
|
||||
{LangID: t.LangID, ScriptID: t.ScriptID},
|
||||
} {
|
||||
if x, err := addTags(id); err == nil && max.equalTags(x) {
|
||||
t.setTagsFrom(id)
|
||||
break
|
||||
}
|
||||
}
|
||||
return t, nil
|
||||
}
|
|
@ -0,0 +1,608 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"golang.org/x/text/internal/tag"
|
||||
)
|
||||
|
||||
// isAlpha returns true if the byte is not a digit.
|
||||
// b must be an ASCII letter or digit.
|
||||
func isAlpha(b byte) bool {
|
||||
return b > '9'
|
||||
}
|
||||
|
||||
// isAlphaNum returns true if the string contains only ASCII letters or digits.
|
||||
func isAlphaNum(s []byte) bool {
|
||||
for _, c := range s {
|
||||
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// ErrSyntax is returned by any of the parsing functions when the
|
||||
// input is not well-formed, according to BCP 47.
|
||||
// TODO: return the position at which the syntax error occurred?
|
||||
var ErrSyntax = errors.New("language: tag is not well-formed")
|
||||
|
||||
// ErrDuplicateKey is returned when a tag contains the same key twice with
|
||||
// different values in the -u section.
|
||||
var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
|
||||
|
||||
// ValueError is returned by any of the parsing functions when the
|
||||
// input is well-formed but the respective subtag is not recognized
|
||||
// as a valid value.
|
||||
type ValueError struct {
|
||||
v [8]byte
|
||||
}
|
||||
|
||||
// NewValueError creates a new ValueError.
|
||||
func NewValueError(tag []byte) ValueError {
|
||||
var e ValueError
|
||||
copy(e.v[:], tag)
|
||||
return e
|
||||
}
|
||||
|
||||
func (e ValueError) tag() []byte {
|
||||
n := bytes.IndexByte(e.v[:], 0)
|
||||
if n == -1 {
|
||||
n = 8
|
||||
}
|
||||
return e.v[:n]
|
||||
}
|
||||
|
||||
// Error implements the error interface.
|
||||
func (e ValueError) Error() string {
|
||||
return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
|
||||
}
|
||||
|
||||
// Subtag returns the subtag for which the error occurred.
|
||||
func (e ValueError) Subtag() string {
|
||||
return string(e.tag())
|
||||
}
|
||||
|
||||
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
|
||||
type scanner struct {
|
||||
b []byte
|
||||
bytes [max99thPercentileSize]byte
|
||||
token []byte
|
||||
start int // start position of the current token
|
||||
end int // end position of the current token
|
||||
next int // next point for scan
|
||||
err error
|
||||
done bool
|
||||
}
|
||||
|
||||
func makeScannerString(s string) scanner {
|
||||
scan := scanner{}
|
||||
if len(s) <= len(scan.bytes) {
|
||||
scan.b = scan.bytes[:copy(scan.bytes[:], s)]
|
||||
} else {
|
||||
scan.b = []byte(s)
|
||||
}
|
||||
scan.init()
|
||||
return scan
|
||||
}
|
||||
|
||||
// makeScanner returns a scanner using b as the input buffer.
|
||||
// b is not copied and may be modified by the scanner routines.
|
||||
func makeScanner(b []byte) scanner {
|
||||
scan := scanner{b: b}
|
||||
scan.init()
|
||||
return scan
|
||||
}
|
||||
|
||||
func (s *scanner) init() {
|
||||
for i, c := range s.b {
|
||||
if c == '_' {
|
||||
s.b[i] = '-'
|
||||
}
|
||||
}
|
||||
s.scan()
|
||||
}
|
||||
|
||||
// restToLower converts the string between start and end to lower case.
|
||||
func (s *scanner) toLower(start, end int) {
|
||||
for i := start; i < end; i++ {
|
||||
c := s.b[i]
|
||||
if 'A' <= c && c <= 'Z' {
|
||||
s.b[i] += 'a' - 'A'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *scanner) setError(e error) {
|
||||
if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
|
||||
s.err = e
|
||||
}
|
||||
}
|
||||
|
||||
// resizeRange shrinks or grows the array at position oldStart such that
|
||||
// a new string of size newSize can fit between oldStart and oldEnd.
|
||||
// Sets the scan point to after the resized range.
|
||||
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
|
||||
s.start = oldStart
|
||||
if end := oldStart + newSize; end != oldEnd {
|
||||
diff := end - oldEnd
|
||||
var b []byte
|
||||
if n := len(s.b) + diff; n > cap(s.b) {
|
||||
b = make([]byte, n)
|
||||
copy(b, s.b[:oldStart])
|
||||
} else {
|
||||
b = s.b[:n]
|
||||
}
|
||||
copy(b[end:], s.b[oldEnd:])
|
||||
s.b = b
|
||||
s.next = end + (s.next - s.end)
|
||||
s.end = end
|
||||
}
|
||||
}
|
||||
|
||||
// replace replaces the current token with repl.
|
||||
func (s *scanner) replace(repl string) {
|
||||
s.resizeRange(s.start, s.end, len(repl))
|
||||
copy(s.b[s.start:], repl)
|
||||
}
|
||||
|
||||
// gobble removes the current token from the input.
|
||||
// Caller must call scan after calling gobble.
|
||||
func (s *scanner) gobble(e error) {
|
||||
s.setError(e)
|
||||
if s.start == 0 {
|
||||
s.b = s.b[:+copy(s.b, s.b[s.next:])]
|
||||
s.end = 0
|
||||
} else {
|
||||
s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
|
||||
s.end = s.start - 1
|
||||
}
|
||||
s.next = s.start
|
||||
}
|
||||
|
||||
// deleteRange removes the given range from s.b before the current token.
|
||||
func (s *scanner) deleteRange(start, end int) {
|
||||
s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
|
||||
diff := end - start
|
||||
s.next -= diff
|
||||
s.start -= diff
|
||||
s.end -= diff
|
||||
}
|
||||
|
||||
// scan parses the next token of a BCP 47 string. Tokens that are larger
|
||||
// than 8 characters or include non-alphanumeric characters result in an error
|
||||
// and are gobbled and removed from the output.
|
||||
// It returns the end position of the last token consumed.
|
||||
func (s *scanner) scan() (end int) {
|
||||
end = s.end
|
||||
s.token = nil
|
||||
for s.start = s.next; s.next < len(s.b); {
|
||||
i := bytes.IndexByte(s.b[s.next:], '-')
|
||||
if i == -1 {
|
||||
s.end = len(s.b)
|
||||
s.next = len(s.b)
|
||||
i = s.end - s.start
|
||||
} else {
|
||||
s.end = s.next + i
|
||||
s.next = s.end + 1
|
||||
}
|
||||
token := s.b[s.start:s.end]
|
||||
if i < 1 || i > 8 || !isAlphaNum(token) {
|
||||
s.gobble(ErrSyntax)
|
||||
continue
|
||||
}
|
||||
s.token = token
|
||||
return end
|
||||
}
|
||||
if n := len(s.b); n > 0 && s.b[n-1] == '-' {
|
||||
s.setError(ErrSyntax)
|
||||
s.b = s.b[:len(s.b)-1]
|
||||
}
|
||||
s.done = true
|
||||
return end
|
||||
}
|
||||
|
||||
// acceptMinSize parses multiple tokens of the given size or greater.
|
||||
// It returns the end position of the last token consumed.
|
||||
func (s *scanner) acceptMinSize(min int) (end int) {
|
||||
end = s.end
|
||||
s.scan()
|
||||
for ; len(s.token) >= min; s.scan() {
|
||||
end = s.end
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
|
||||
// failed it returns an error and any part of the tag that could be parsed.
|
||||
// If parsing succeeded but an unknown value was found, it returns
|
||||
// ValueError. The Tag returned in this case is just stripped of the unknown
|
||||
// value. All other values are preserved. It accepts tags in the BCP 47 format
|
||||
// and extensions to this standard defined in
|
||||
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
func Parse(s string) (t Tag, err error) {
|
||||
// TODO: consider supporting old-style locale key-value pairs.
|
||||
if s == "" {
|
||||
return Und, ErrSyntax
|
||||
}
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
t = Und
|
||||
err = ErrSyntax
|
||||
return
|
||||
}
|
||||
}()
|
||||
if len(s) <= maxAltTaglen {
|
||||
b := [maxAltTaglen]byte{}
|
||||
for i, c := range s {
|
||||
// Generating invalid UTF-8 is okay as it won't match.
|
||||
if 'A' <= c && c <= 'Z' {
|
||||
c += 'a' - 'A'
|
||||
} else if c == '_' {
|
||||
c = '-'
|
||||
}
|
||||
b[i] = byte(c)
|
||||
}
|
||||
if t, ok := grandfathered(b); ok {
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
scan := makeScannerString(s)
|
||||
return parse(&scan, s)
|
||||
}
|
||||
|
||||
func parse(scan *scanner, s string) (t Tag, err error) {
|
||||
t = Und
|
||||
var end int
|
||||
if n := len(scan.token); n <= 1 {
|
||||
scan.toLower(0, len(scan.b))
|
||||
if n == 0 || scan.token[0] != 'x' {
|
||||
return t, ErrSyntax
|
||||
}
|
||||
end = parseExtensions(scan)
|
||||
} else if n >= 4 {
|
||||
return Und, ErrSyntax
|
||||
} else { // the usual case
|
||||
t, end = parseTag(scan, true)
|
||||
if n := len(scan.token); n == 1 {
|
||||
t.pExt = uint16(end)
|
||||
end = parseExtensions(scan)
|
||||
} else if end < len(scan.b) {
|
||||
scan.setError(ErrSyntax)
|
||||
scan.b = scan.b[:end]
|
||||
}
|
||||
}
|
||||
if int(t.pVariant) < len(scan.b) {
|
||||
if end < len(s) {
|
||||
s = s[:end]
|
||||
}
|
||||
if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
|
||||
t.str = s
|
||||
} else {
|
||||
t.str = string(scan.b)
|
||||
}
|
||||
} else {
|
||||
t.pVariant, t.pExt = 0, 0
|
||||
}
|
||||
return t, scan.err
|
||||
}
|
||||
|
||||
// parseTag parses language, script, region and variants.
|
||||
// It returns a Tag and the end position in the input that was parsed.
|
||||
// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
|
||||
func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
|
||||
var e error
|
||||
// TODO: set an error if an unknown lang, script or region is encountered.
|
||||
t.LangID, e = getLangID(scan.token)
|
||||
scan.setError(e)
|
||||
scan.replace(t.LangID.String())
|
||||
langStart := scan.start
|
||||
end = scan.scan()
|
||||
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
|
||||
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
|
||||
// to a tag of the form <extlang>.
|
||||
if doNorm {
|
||||
lang, e := getLangID(scan.token)
|
||||
if lang != 0 {
|
||||
t.LangID = lang
|
||||
langStr := lang.String()
|
||||
copy(scan.b[langStart:], langStr)
|
||||
scan.b[langStart+len(langStr)] = '-'
|
||||
scan.start = langStart + len(langStr) + 1
|
||||
}
|
||||
scan.gobble(e)
|
||||
}
|
||||
end = scan.scan()
|
||||
}
|
||||
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
|
||||
t.ScriptID, e = getScriptID(script, scan.token)
|
||||
if t.ScriptID == 0 {
|
||||
scan.gobble(e)
|
||||
}
|
||||
end = scan.scan()
|
||||
}
|
||||
if n := len(scan.token); n >= 2 && n <= 3 {
|
||||
t.RegionID, e = getRegionID(scan.token)
|
||||
if t.RegionID == 0 {
|
||||
scan.gobble(e)
|
||||
} else {
|
||||
scan.replace(t.RegionID.String())
|
||||
}
|
||||
end = scan.scan()
|
||||
}
|
||||
scan.toLower(scan.start, len(scan.b))
|
||||
t.pVariant = byte(end)
|
||||
end = parseVariants(scan, end, t)
|
||||
t.pExt = uint16(end)
|
||||
return t, end
|
||||
}
|
||||
|
||||
var separator = []byte{'-'}
|
||||
|
||||
// parseVariants scans tokens as long as each token is a valid variant string.
|
||||
// Duplicate variants are removed.
|
||||
func parseVariants(scan *scanner, end int, t Tag) int {
|
||||
start := scan.start
|
||||
varIDBuf := [4]uint8{}
|
||||
variantBuf := [4][]byte{}
|
||||
varID := varIDBuf[:0]
|
||||
variant := variantBuf[:0]
|
||||
last := -1
|
||||
needSort := false
|
||||
for ; len(scan.token) >= 4; scan.scan() {
|
||||
// TODO: measure the impact of needing this conversion and redesign
|
||||
// the data structure if there is an issue.
|
||||
v, ok := variantIndex[string(scan.token)]
|
||||
if !ok {
|
||||
// unknown variant
|
||||
// TODO: allow user-defined variants?
|
||||
scan.gobble(NewValueError(scan.token))
|
||||
continue
|
||||
}
|
||||
varID = append(varID, v)
|
||||
variant = append(variant, scan.token)
|
||||
if !needSort {
|
||||
if last < int(v) {
|
||||
last = int(v)
|
||||
} else {
|
||||
needSort = true
|
||||
// There is no legal combinations of more than 7 variants
|
||||
// (and this is by no means a useful sequence).
|
||||
const maxVariants = 8
|
||||
if len(varID) > maxVariants {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
end = scan.end
|
||||
}
|
||||
if needSort {
|
||||
sort.Sort(variantsSort{varID, variant})
|
||||
k, l := 0, -1
|
||||
for i, v := range varID {
|
||||
w := int(v)
|
||||
if l == w {
|
||||
// Remove duplicates.
|
||||
continue
|
||||
}
|
||||
varID[k] = varID[i]
|
||||
variant[k] = variant[i]
|
||||
k++
|
||||
l = w
|
||||
}
|
||||
if str := bytes.Join(variant[:k], separator); len(str) == 0 {
|
||||
end = start - 1
|
||||
} else {
|
||||
scan.resizeRange(start, end, len(str))
|
||||
copy(scan.b[scan.start:], str)
|
||||
end = scan.end
|
||||
}
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
type variantsSort struct {
|
||||
i []uint8
|
||||
v [][]byte
|
||||
}
|
||||
|
||||
func (s variantsSort) Len() int {
|
||||
return len(s.i)
|
||||
}
|
||||
|
||||
func (s variantsSort) Swap(i, j int) {
|
||||
s.i[i], s.i[j] = s.i[j], s.i[i]
|
||||
s.v[i], s.v[j] = s.v[j], s.v[i]
|
||||
}
|
||||
|
||||
func (s variantsSort) Less(i, j int) bool {
|
||||
return s.i[i] < s.i[j]
|
||||
}
|
||||
|
||||
type bytesSort struct {
|
||||
b [][]byte
|
||||
n int // first n bytes to compare
|
||||
}
|
||||
|
||||
func (b bytesSort) Len() int {
|
||||
return len(b.b)
|
||||
}
|
||||
|
||||
func (b bytesSort) Swap(i, j int) {
|
||||
b.b[i], b.b[j] = b.b[j], b.b[i]
|
||||
}
|
||||
|
||||
func (b bytesSort) Less(i, j int) bool {
|
||||
for k := 0; k < b.n; k++ {
|
||||
if b.b[i][k] == b.b[j][k] {
|
||||
continue
|
||||
}
|
||||
return b.b[i][k] < b.b[j][k]
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// parseExtensions parses and normalizes the extensions in the buffer.
|
||||
// It returns the last position of scan.b that is part of any extension.
|
||||
// It also trims scan.b to remove excess parts accordingly.
|
||||
func parseExtensions(scan *scanner) int {
|
||||
start := scan.start
|
||||
exts := [][]byte{}
|
||||
private := []byte{}
|
||||
end := scan.end
|
||||
for len(scan.token) == 1 {
|
||||
extStart := scan.start
|
||||
ext := scan.token[0]
|
||||
end = parseExtension(scan)
|
||||
extension := scan.b[extStart:end]
|
||||
if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
|
||||
scan.setError(ErrSyntax)
|
||||
end = extStart
|
||||
continue
|
||||
} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
|
||||
scan.b = scan.b[:end]
|
||||
return end
|
||||
} else if ext == 'x' {
|
||||
private = extension
|
||||
break
|
||||
}
|
||||
exts = append(exts, extension)
|
||||
}
|
||||
sort.Sort(bytesSort{exts, 1})
|
||||
if len(private) > 0 {
|
||||
exts = append(exts, private)
|
||||
}
|
||||
scan.b = scan.b[:start]
|
||||
if len(exts) > 0 {
|
||||
scan.b = append(scan.b, bytes.Join(exts, separator)...)
|
||||
} else if start > 0 {
|
||||
// Strip trailing '-'.
|
||||
scan.b = scan.b[:start-1]
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
// parseExtension parses a single extension and returns the position of
|
||||
// the extension end.
|
||||
func parseExtension(scan *scanner) int {
|
||||
start, end := scan.start, scan.end
|
||||
switch scan.token[0] {
|
||||
case 'u': // https://www.ietf.org/rfc/rfc6067.txt
|
||||
attrStart := end
|
||||
scan.scan()
|
||||
for last := []byte{}; len(scan.token) > 2; scan.scan() {
|
||||
if bytes.Compare(scan.token, last) != -1 {
|
||||
// Attributes are unsorted. Start over from scratch.
|
||||
p := attrStart + 1
|
||||
scan.next = p
|
||||
attrs := [][]byte{}
|
||||
for scan.scan(); len(scan.token) > 2; scan.scan() {
|
||||
attrs = append(attrs, scan.token)
|
||||
end = scan.end
|
||||
}
|
||||
sort.Sort(bytesSort{attrs, 3})
|
||||
copy(scan.b[p:], bytes.Join(attrs, separator))
|
||||
break
|
||||
}
|
||||
last = scan.token
|
||||
end = scan.end
|
||||
}
|
||||
// Scan key-type sequences. A key is of length 2 and may be followed
|
||||
// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
|
||||
var last, key []byte
|
||||
for attrEnd := end; len(scan.token) == 2; last = key {
|
||||
key = scan.token
|
||||
end = scan.end
|
||||
for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
|
||||
end = scan.end
|
||||
}
|
||||
// TODO: check key value validity
|
||||
if bytes.Compare(key, last) != 1 || scan.err != nil {
|
||||
// We have an invalid key or the keys are not sorted.
|
||||
// Start scanning keys from scratch and reorder.
|
||||
p := attrEnd + 1
|
||||
scan.next = p
|
||||
keys := [][]byte{}
|
||||
for scan.scan(); len(scan.token) == 2; {
|
||||
keyStart := scan.start
|
||||
end = scan.end
|
||||
for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
|
||||
end = scan.end
|
||||
}
|
||||
keys = append(keys, scan.b[keyStart:end])
|
||||
}
|
||||
sort.Stable(bytesSort{keys, 2})
|
||||
if n := len(keys); n > 0 {
|
||||
k := 0
|
||||
for i := 1; i < n; i++ {
|
||||
if !bytes.Equal(keys[k][:2], keys[i][:2]) {
|
||||
k++
|
||||
keys[k] = keys[i]
|
||||
} else if !bytes.Equal(keys[k], keys[i]) {
|
||||
scan.setError(ErrDuplicateKey)
|
||||
}
|
||||
}
|
||||
keys = keys[:k+1]
|
||||
}
|
||||
reordered := bytes.Join(keys, separator)
|
||||
if e := p + len(reordered); e < end {
|
||||
scan.deleteRange(e, end)
|
||||
end = e
|
||||
}
|
||||
copy(scan.b[p:], reordered)
|
||||
break
|
||||
}
|
||||
}
|
||||
case 't': // https://www.ietf.org/rfc/rfc6497.txt
|
||||
scan.scan()
|
||||
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
|
||||
_, end = parseTag(scan, false)
|
||||
scan.toLower(start, end)
|
||||
}
|
||||
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
|
||||
end = scan.acceptMinSize(3)
|
||||
}
|
||||
case 'x':
|
||||
end = scan.acceptMinSize(1)
|
||||
default:
|
||||
end = scan.acceptMinSize(2)
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
// getExtension returns the name, body and end position of the extension.
|
||||
func getExtension(s string, p int) (end int, ext string) {
|
||||
if s[p] == '-' {
|
||||
p++
|
||||
}
|
||||
if s[p] == 'x' {
|
||||
return len(s), s[p:]
|
||||
}
|
||||
end = nextExtension(s, p)
|
||||
return end, s[p:end]
|
||||
}
|
||||
|
||||
// nextExtension finds the next extension within the string, searching
|
||||
// for the -<char>- pattern from position p.
|
||||
// In the fast majority of cases, language tags will have at most
|
||||
// one extension and extensions tend to be small.
|
||||
func nextExtension(s string, p int) int {
|
||||
for n := len(s) - 3; p < n; {
|
||||
if s[p] == '-' {
|
||||
if s[p+2] == '-' {
|
||||
return p
|
||||
}
|
||||
p += 3
|
||||
} else {
|
||||
p++
|
||||
}
|
||||
}
|
||||
return len(s)
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,48 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
|
||||
// It simplifies safe initialization of Tag values.
|
||||
func MustParse(s string) Tag {
|
||||
t, err := Parse(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// MustParseBase is like ParseBase, but panics if the given base cannot be parsed.
|
||||
// It simplifies safe initialization of Base values.
|
||||
func MustParseBase(s string) Language {
|
||||
b, err := ParseBase(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// MustParseScript is like ParseScript, but panics if the given script cannot be
|
||||
// parsed. It simplifies safe initialization of Script values.
|
||||
func MustParseScript(s string) Script {
|
||||
scr, err := ParseScript(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return scr
|
||||
}
|
||||
|
||||
// MustParseRegion is like ParseRegion, but panics if the given region cannot be
|
||||
// parsed. It simplifies safe initialization of Region values.
|
||||
func MustParseRegion(s string) Region {
|
||||
r, err := ParseRegion(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// Und is the root language.
|
||||
var Und Tag
|
|
@ -0,0 +1,67 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package internal
|
||||
|
||||
// This file contains matchers that implement CLDR inheritance.
|
||||
//
|
||||
// See https://unicode.org/reports/tr35/#Locale_Inheritance.
|
||||
//
|
||||
// Some of the inheritance described in this document is already handled by
|
||||
// the cldr package.
|
||||
|
||||
import (
|
||||
"golang.org/x/text/language"
|
||||
)
|
||||
|
||||
// TODO: consider if (some of the) matching algorithm needs to be public after
|
||||
// getting some feel about what is generic and what is specific.
|
||||
|
||||
// NewInheritanceMatcher returns a matcher that matches based on the inheritance
|
||||
// chain.
|
||||
//
|
||||
// The matcher uses canonicalization and the parent relationship to find a
|
||||
// match. The resulting match will always be either Und or a language with the
|
||||
// same language and script as the requested language. It will not match
|
||||
// languages for which there is understood to be mutual or one-directional
|
||||
// intelligibility.
|
||||
//
|
||||
// A Match will indicate an Exact match if the language matches after
|
||||
// canonicalization and High if the matched tag is a parent.
|
||||
func NewInheritanceMatcher(t []language.Tag) *InheritanceMatcher {
|
||||
tags := &InheritanceMatcher{make(map[language.Tag]int)}
|
||||
for i, tag := range t {
|
||||
ct, err := language.All.Canonicalize(tag)
|
||||
if err != nil {
|
||||
ct = tag
|
||||
}
|
||||
tags.index[ct] = i
|
||||
}
|
||||
return tags
|
||||
}
|
||||
|
||||
type InheritanceMatcher struct {
|
||||
index map[language.Tag]int
|
||||
}
|
||||
|
||||
func (m InheritanceMatcher) Match(want ...language.Tag) (language.Tag, int, language.Confidence) {
|
||||
for _, t := range want {
|
||||
ct, err := language.All.Canonicalize(t)
|
||||
if err != nil {
|
||||
ct = t
|
||||
}
|
||||
conf := language.Exact
|
||||
for {
|
||||
if index, ok := m.index[ct]; ok {
|
||||
return ct, index, conf
|
||||
}
|
||||
if ct == language.Und {
|
||||
break
|
||||
}
|
||||
ct = ct.Parent()
|
||||
conf = language.High
|
||||
}
|
||||
}
|
||||
return language.Und, 0, language.No
|
||||
}
|
|
@ -0,0 +1,100 @@
|
|||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package tag contains functionality handling tags and related data.
|
||||
package tag // import "golang.org/x/text/internal/tag"
|
||||
|
||||
import "sort"
|
||||
|
||||
// An Index converts tags to a compact numeric value.
|
||||
//
|
||||
// All elements are of size 4. Tags may be up to 4 bytes long. Excess bytes can
|
||||
// be used to store additional information about the tag.
|
||||
type Index string
|
||||
|
||||
// Elem returns the element data at the given index.
|
||||
func (s Index) Elem(x int) string {
|
||||
return string(s[x*4 : x*4+4])
|
||||
}
|
||||
|
||||
// Index reports the index of the given key or -1 if it could not be found.
|
||||
// Only the first len(key) bytes from the start of the 4-byte entries will be
|
||||
// considered for the search and the first match in Index will be returned.
|
||||
func (s Index) Index(key []byte) int {
|
||||
n := len(key)
|
||||
// search the index of the first entry with an equal or higher value than
|
||||
// key in s.
|
||||
index := sort.Search(len(s)/4, func(i int) bool {
|
||||
return cmp(s[i*4:i*4+n], key) != -1
|
||||
})
|
||||
i := index * 4
|
||||
if cmp(s[i:i+len(key)], key) != 0 {
|
||||
return -1
|
||||
}
|
||||
return index
|
||||
}
|
||||
|
||||
// Next finds the next occurrence of key after index x, which must have been
|
||||
// obtained from a call to Index using the same key. It returns x+1 or -1.
|
||||
func (s Index) Next(key []byte, x int) int {
|
||||
if x++; x*4 < len(s) && cmp(s[x*4:x*4+len(key)], key) == 0 {
|
||||
return x
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// cmp returns an integer comparing a and b lexicographically.
|
||||
func cmp(a Index, b []byte) int {
|
||||
n := len(a)
|
||||
if len(b) < n {
|
||||
n = len(b)
|
||||
}
|
||||
for i, c := range b[:n] {
|
||||
switch {
|
||||
case a[i] > c:
|
||||
return 1
|
||||
case a[i] < c:
|
||||
return -1
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case len(a) < len(b):
|
||||
return -1
|
||||
case len(a) > len(b):
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// Compare returns an integer comparing a and b lexicographically.
|
||||
func Compare(a string, b []byte) int {
|
||||
return cmp(Index(a), b)
|
||||
}
|
||||
|
||||
// FixCase reformats b to the same pattern of cases as form.
|
||||
// If returns false if string b is malformed.
|
||||
func FixCase(form string, b []byte) bool {
|
||||
if len(form) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i, c := range b {
|
||||
if form[i] <= 'Z' {
|
||||
if c >= 'a' {
|
||||
c -= 'z' - 'Z'
|
||||
}
|
||||
if c < 'A' || 'Z' < c {
|
||||
return false
|
||||
}
|
||||
} else {
|
||||
if c <= 'Z' {
|
||||
c += 'z' - 'Z'
|
||||
}
|
||||
if c < 'a' || 'z' < c {
|
||||
return false
|
||||
}
|
||||
}
|
||||
b[i] = c
|
||||
}
|
||||
return true
|
||||
}
|
|
@ -0,0 +1,187 @@
|
|||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"golang.org/x/text/internal/language"
|
||||
)
|
||||
|
||||
// The Coverage interface is used to define the level of coverage of an
|
||||
// internationalization service. Note that not all types are supported by all
|
||||
// services. As lists may be generated on the fly, it is recommended that users
|
||||
// of a Coverage cache the results.
|
||||
type Coverage interface {
|
||||
// Tags returns the list of supported tags.
|
||||
Tags() []Tag
|
||||
|
||||
// BaseLanguages returns the list of supported base languages.
|
||||
BaseLanguages() []Base
|
||||
|
||||
// Scripts returns the list of supported scripts.
|
||||
Scripts() []Script
|
||||
|
||||
// Regions returns the list of supported regions.
|
||||
Regions() []Region
|
||||
}
|
||||
|
||||
var (
|
||||
// Supported defines a Coverage that lists all supported subtags. Tags
|
||||
// always returns nil.
|
||||
Supported Coverage = allSubtags{}
|
||||
)
|
||||
|
||||
// TODO:
|
||||
// - Support Variants, numbering systems.
|
||||
// - CLDR coverage levels.
|
||||
// - Set of common tags defined in this package.
|
||||
|
||||
type allSubtags struct{}
|
||||
|
||||
// Regions returns the list of supported regions. As all regions are in a
|
||||
// consecutive range, it simply returns a slice of numbers in increasing order.
|
||||
// The "undefined" region is not returned.
|
||||
func (s allSubtags) Regions() []Region {
|
||||
reg := make([]Region, language.NumRegions)
|
||||
for i := range reg {
|
||||
reg[i] = Region{language.Region(i + 1)}
|
||||
}
|
||||
return reg
|
||||
}
|
||||
|
||||
// Scripts returns the list of supported scripts. As all scripts are in a
|
||||
// consecutive range, it simply returns a slice of numbers in increasing order.
|
||||
// The "undefined" script is not returned.
|
||||
func (s allSubtags) Scripts() []Script {
|
||||
scr := make([]Script, language.NumScripts)
|
||||
for i := range scr {
|
||||
scr[i] = Script{language.Script(i + 1)}
|
||||
}
|
||||
return scr
|
||||
}
|
||||
|
||||
// BaseLanguages returns the list of all supported base languages. It generates
|
||||
// the list by traversing the internal structures.
|
||||
func (s allSubtags) BaseLanguages() []Base {
|
||||
bs := language.BaseLanguages()
|
||||
base := make([]Base, len(bs))
|
||||
for i, b := range bs {
|
||||
base[i] = Base{b}
|
||||
}
|
||||
return base
|
||||
}
|
||||
|
||||
// Tags always returns nil.
|
||||
func (s allSubtags) Tags() []Tag {
|
||||
return nil
|
||||
}
|
||||
|
||||
// coverage is used by NewCoverage which is used as a convenient way for
|
||||
// creating Coverage implementations for partially defined data. Very often a
|
||||
// package will only need to define a subset of slices. coverage provides a
|
||||
// convenient way to do this. Moreover, packages using NewCoverage, instead of
|
||||
// their own implementation, will not break if later new slice types are added.
|
||||
type coverage struct {
|
||||
tags func() []Tag
|
||||
bases func() []Base
|
||||
scripts func() []Script
|
||||
regions func() []Region
|
||||
}
|
||||
|
||||
func (s *coverage) Tags() []Tag {
|
||||
if s.tags == nil {
|
||||
return nil
|
||||
}
|
||||
return s.tags()
|
||||
}
|
||||
|
||||
// bases implements sort.Interface and is used to sort base languages.
|
||||
type bases []Base
|
||||
|
||||
func (b bases) Len() int {
|
||||
return len(b)
|
||||
}
|
||||
|
||||
func (b bases) Swap(i, j int) {
|
||||
b[i], b[j] = b[j], b[i]
|
||||
}
|
||||
|
||||
func (b bases) Less(i, j int) bool {
|
||||
return b[i].langID < b[j].langID
|
||||
}
|
||||
|
||||
// BaseLanguages returns the result from calling s.bases if it is specified or
|
||||
// otherwise derives the set of supported base languages from tags.
|
||||
func (s *coverage) BaseLanguages() []Base {
|
||||
if s.bases == nil {
|
||||
tags := s.Tags()
|
||||
if len(tags) == 0 {
|
||||
return nil
|
||||
}
|
||||
a := make([]Base, len(tags))
|
||||
for i, t := range tags {
|
||||
a[i] = Base{language.Language(t.lang())}
|
||||
}
|
||||
sort.Sort(bases(a))
|
||||
k := 0
|
||||
for i := 1; i < len(a); i++ {
|
||||
if a[k] != a[i] {
|
||||
k++
|
||||
a[k] = a[i]
|
||||
}
|
||||
}
|
||||
return a[:k+1]
|
||||
}
|
||||
return s.bases()
|
||||
}
|
||||
|
||||
func (s *coverage) Scripts() []Script {
|
||||
if s.scripts == nil {
|
||||
return nil
|
||||
}
|
||||
return s.scripts()
|
||||
}
|
||||
|
||||
func (s *coverage) Regions() []Region {
|
||||
if s.regions == nil {
|
||||
return nil
|
||||
}
|
||||
return s.regions()
|
||||
}
|
||||
|
||||
// NewCoverage returns a Coverage for the given lists. It is typically used by
|
||||
// packages providing internationalization services to define their level of
|
||||
// coverage. A list may be of type []T or func() []T, where T is either Tag,
|
||||
// Base, Script or Region. The returned Coverage derives the value for Bases
|
||||
// from Tags if no func or slice for []Base is specified. For other unspecified
|
||||
// types the returned Coverage will return nil for the respective methods.
|
||||
func NewCoverage(list ...interface{}) Coverage {
|
||||
s := &coverage{}
|
||||
for _, x := range list {
|
||||
switch v := x.(type) {
|
||||
case func() []Base:
|
||||
s.bases = v
|
||||
case func() []Script:
|
||||
s.scripts = v
|
||||
case func() []Region:
|
||||
s.regions = v
|
||||
case func() []Tag:
|
||||
s.tags = v
|
||||
case []Base:
|
||||
s.bases = func() []Base { return v }
|
||||
case []Script:
|
||||
s.scripts = func() []Script { return v }
|
||||
case []Region:
|
||||
s.regions = func() []Region { return v }
|
||||
case []Tag:
|
||||
s.tags = func() []Tag { return v }
|
||||
default:
|
||||
panic(fmt.Sprintf("language: unsupported set type %T", v))
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
|
@ -0,0 +1,98 @@
|
|||
// Copyright 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package language implements BCP 47 language tags and related functionality.
|
||||
//
|
||||
// The most important function of package language is to match a list of
|
||||
// user-preferred languages to a list of supported languages.
|
||||
// It alleviates the developer of dealing with the complexity of this process
|
||||
// and provides the user with the best experience
|
||||
// (see https://blog.golang.org/matchlang).
|
||||
//
|
||||
// # Matching preferred against supported languages
|
||||
//
|
||||
// A Matcher for an application that supports English, Australian English,
|
||||
// Danish, and standard Mandarin can be created as follows:
|
||||
//
|
||||
// var matcher = language.NewMatcher([]language.Tag{
|
||||
// language.English, // The first language is used as fallback.
|
||||
// language.MustParse("en-AU"),
|
||||
// language.Danish,
|
||||
// language.Chinese,
|
||||
// })
|
||||
//
|
||||
// This list of supported languages is typically implied by the languages for
|
||||
// which there exists translations of the user interface.
|
||||
//
|
||||
// User-preferred languages usually come as a comma-separated list of BCP 47
|
||||
// language tags.
|
||||
// The MatchString finds best matches for such strings:
|
||||
//
|
||||
// handler(w http.ResponseWriter, r *http.Request) {
|
||||
// lang, _ := r.Cookie("lang")
|
||||
// accept := r.Header.Get("Accept-Language")
|
||||
// tag, _ := language.MatchStrings(matcher, lang.String(), accept)
|
||||
//
|
||||
// // tag should now be used for the initialization of any
|
||||
// // locale-specific service.
|
||||
// }
|
||||
//
|
||||
// The Matcher's Match method can be used to match Tags directly.
|
||||
//
|
||||
// Matchers are aware of the intricacies of equivalence between languages, such
|
||||
// as deprecated subtags, legacy tags, macro languages, mutual
|
||||
// intelligibility between scripts and languages, and transparently passing
|
||||
// BCP 47 user configuration.
|
||||
// For instance, it will know that a reader of Bokmål Danish can read Norwegian
|
||||
// and will know that Cantonese ("yue") is a good match for "zh-HK".
|
||||
//
|
||||
// # Using match results
|
||||
//
|
||||
// To guarantee a consistent user experience to the user it is important to
|
||||
// use the same language tag for the selection of any locale-specific services.
|
||||
// For example, it is utterly confusing to substitute spelled-out numbers
|
||||
// or dates in one language in text of another language.
|
||||
// More subtly confusing is using the wrong sorting order or casing
|
||||
// algorithm for a certain language.
|
||||
//
|
||||
// All the packages in x/text that provide locale-specific services
|
||||
// (e.g. collate, cases) should be initialized with the tag that was
|
||||
// obtained at the start of an interaction with the user.
|
||||
//
|
||||
// Note that Tag that is returned by Match and MatchString may differ from any
|
||||
// of the supported languages, as it may contain carried over settings from
|
||||
// the user tags.
|
||||
// This may be inconvenient when your application has some additional
|
||||
// locale-specific data for your supported languages.
|
||||
// Match and MatchString both return the index of the matched supported tag
|
||||
// to simplify associating such data with the matched tag.
|
||||
//
|
||||
// # Canonicalization
|
||||
//
|
||||
// If one uses the Matcher to compare languages one does not need to
|
||||
// worry about canonicalization.
|
||||
//
|
||||
// The meaning of a Tag varies per application. The language package
|
||||
// therefore delays canonicalization and preserves information as much
|
||||
// as possible. The Matcher, however, will always take into account that
|
||||
// two different tags may represent the same language.
|
||||
//
|
||||
// By default, only legacy and deprecated tags are converted into their
|
||||
// canonical equivalent. All other information is preserved. This approach makes
|
||||
// the confidence scores more accurate and allows matchers to distinguish
|
||||
// between variants that are otherwise lost.
|
||||
//
|
||||
// As a consequence, two tags that should be treated as identical according to
|
||||
// BCP 47 or CLDR, like "en-Latn" and "en", will be represented differently. The
|
||||
// Matcher handles such distinctions, though, and is aware of the
|
||||
// equivalence relations. The CanonType type can be used to alter the
|
||||
// canonicalization form.
|
||||
//
|
||||
// # References
|
||||
//
|
||||
// BCP 47 - Tags for Identifying Languages http://tools.ietf.org/html/bcp47
|
||||
package language // import "golang.org/x/text/language"
|
||||
|
||||
// TODO: explanation on how to match languages for your own locale-specific
|
||||
// service.
|
|
@ -0,0 +1,605 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go -output tables.go
|
||||
|
||||
package language
|
||||
|
||||
// TODO: Remove above NOTE after:
|
||||
// - verifying that tables are dropped correctly (most notably matcher tables).
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/language"
|
||||
"golang.org/x/text/internal/language/compact"
|
||||
)
|
||||
|
||||
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
|
||||
// specific language or locale. All language tag values are guaranteed to be
|
||||
// well-formed.
|
||||
type Tag compact.Tag
|
||||
|
||||
func makeTag(t language.Tag) (tag Tag) {
|
||||
return Tag(compact.Make(t))
|
||||
}
|
||||
|
||||
func (t *Tag) tag() language.Tag {
|
||||
return (*compact.Tag)(t).Tag()
|
||||
}
|
||||
|
||||
func (t *Tag) isCompact() bool {
|
||||
return (*compact.Tag)(t).IsCompact()
|
||||
}
|
||||
|
||||
// TODO: improve performance.
|
||||
func (t *Tag) lang() language.Language { return t.tag().LangID }
|
||||
func (t *Tag) region() language.Region { return t.tag().RegionID }
|
||||
func (t *Tag) script() language.Script { return t.tag().ScriptID }
|
||||
|
||||
// Make is a convenience wrapper for Parse that omits the error.
|
||||
// In case of an error, a sensible default is returned.
|
||||
func Make(s string) Tag {
|
||||
return Default.Make(s)
|
||||
}
|
||||
|
||||
// Make is a convenience wrapper for c.Parse that omits the error.
|
||||
// In case of an error, a sensible default is returned.
|
||||
func (c CanonType) Make(s string) Tag {
|
||||
t, _ := c.Parse(s)
|
||||
return t
|
||||
}
|
||||
|
||||
// Raw returns the raw base language, script and region, without making an
|
||||
// attempt to infer their values.
|
||||
func (t Tag) Raw() (b Base, s Script, r Region) {
|
||||
tt := t.tag()
|
||||
return Base{tt.LangID}, Script{tt.ScriptID}, Region{tt.RegionID}
|
||||
}
|
||||
|
||||
// IsRoot returns true if t is equal to language "und".
|
||||
func (t Tag) IsRoot() bool {
|
||||
return compact.Tag(t).IsRoot()
|
||||
}
|
||||
|
||||
// CanonType can be used to enable or disable various types of canonicalization.
|
||||
type CanonType int
|
||||
|
||||
const (
|
||||
// Replace deprecated base languages with their preferred replacements.
|
||||
DeprecatedBase CanonType = 1 << iota
|
||||
// Replace deprecated scripts with their preferred replacements.
|
||||
DeprecatedScript
|
||||
// Replace deprecated regions with their preferred replacements.
|
||||
DeprecatedRegion
|
||||
// Remove redundant scripts.
|
||||
SuppressScript
|
||||
// Normalize legacy encodings. This includes legacy languages defined in
|
||||
// CLDR as well as bibliographic codes defined in ISO-639.
|
||||
Legacy
|
||||
// Map the dominant language of a macro language group to the macro language
|
||||
// subtag. For example cmn -> zh.
|
||||
Macro
|
||||
// The CLDR flag should be used if full compatibility with CLDR is required.
|
||||
// There are a few cases where language.Tag may differ from CLDR. To follow all
|
||||
// of CLDR's suggestions, use All|CLDR.
|
||||
CLDR
|
||||
|
||||
// Raw can be used to Compose or Parse without Canonicalization.
|
||||
Raw CanonType = 0
|
||||
|
||||
// Replace all deprecated tags with their preferred replacements.
|
||||
Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion
|
||||
|
||||
// All canonicalizations recommended by BCP 47.
|
||||
BCP47 = Deprecated | SuppressScript
|
||||
|
||||
// All canonicalizations.
|
||||
All = BCP47 | Legacy | Macro
|
||||
|
||||
// Default is the canonicalization used by Parse, Make and Compose. To
|
||||
// preserve as much information as possible, canonicalizations that remove
|
||||
// potentially valuable information are not included. The Matcher is
|
||||
// designed to recognize similar tags that would be the same if
|
||||
// they were canonicalized using All.
|
||||
Default = Deprecated | Legacy
|
||||
|
||||
canonLang = DeprecatedBase | Legacy | Macro
|
||||
|
||||
// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
|
||||
)
|
||||
|
||||
// canonicalize returns the canonicalized equivalent of the tag and
|
||||
// whether there was any change.
|
||||
func canonicalize(c CanonType, t language.Tag) (language.Tag, bool) {
|
||||
if c == Raw {
|
||||
return t, false
|
||||
}
|
||||
changed := false
|
||||
if c&SuppressScript != 0 {
|
||||
if t.LangID.SuppressScript() == t.ScriptID {
|
||||
t.ScriptID = 0
|
||||
changed = true
|
||||
}
|
||||
}
|
||||
if c&canonLang != 0 {
|
||||
for {
|
||||
if l, aliasType := t.LangID.Canonicalize(); l != t.LangID {
|
||||
switch aliasType {
|
||||
case language.Legacy:
|
||||
if c&Legacy != 0 {
|
||||
if t.LangID == _sh && t.ScriptID == 0 {
|
||||
t.ScriptID = _Latn
|
||||
}
|
||||
t.LangID = l
|
||||
changed = true
|
||||
}
|
||||
case language.Macro:
|
||||
if c&Macro != 0 {
|
||||
// We deviate here from CLDR. The mapping "nb" -> "no"
|
||||
// qualifies as a typical Macro language mapping. However,
|
||||
// for legacy reasons, CLDR maps "no", the macro language
|
||||
// code for Norwegian, to the dominant variant "nb". This
|
||||
// change is currently under consideration for CLDR as well.
|
||||
// See https://unicode.org/cldr/trac/ticket/2698 and also
|
||||
// https://unicode.org/cldr/trac/ticket/1790 for some of the
|
||||
// practical implications. TODO: this check could be removed
|
||||
// if CLDR adopts this change.
|
||||
if c&CLDR == 0 || t.LangID != _nb {
|
||||
changed = true
|
||||
t.LangID = l
|
||||
}
|
||||
}
|
||||
case language.Deprecated:
|
||||
if c&DeprecatedBase != 0 {
|
||||
if t.LangID == _mo && t.RegionID == 0 {
|
||||
t.RegionID = _MD
|
||||
}
|
||||
t.LangID = l
|
||||
changed = true
|
||||
// Other canonicalization types may still apply.
|
||||
continue
|
||||
}
|
||||
}
|
||||
} else if c&Legacy != 0 && t.LangID == _no && c&CLDR != 0 {
|
||||
t.LangID = _nb
|
||||
changed = true
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if c&DeprecatedScript != 0 {
|
||||
if t.ScriptID == _Qaai {
|
||||
changed = true
|
||||
t.ScriptID = _Zinh
|
||||
}
|
||||
}
|
||||
if c&DeprecatedRegion != 0 {
|
||||
if r := t.RegionID.Canonicalize(); r != t.RegionID {
|
||||
changed = true
|
||||
t.RegionID = r
|
||||
}
|
||||
}
|
||||
return t, changed
|
||||
}
|
||||
|
||||
// Canonicalize returns the canonicalized equivalent of the tag.
|
||||
func (c CanonType) Canonicalize(t Tag) (Tag, error) {
|
||||
// First try fast path.
|
||||
if t.isCompact() {
|
||||
if _, changed := canonicalize(c, compact.Tag(t).Tag()); !changed {
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
// It is unlikely that one will canonicalize a tag after matching. So do
|
||||
// a slow but simple approach here.
|
||||
if tag, changed := canonicalize(c, t.tag()); changed {
|
||||
tag.RemakeString()
|
||||
return makeTag(tag), nil
|
||||
}
|
||||
return t, nil
|
||||
|
||||
}
|
||||
|
||||
// Confidence indicates the level of certainty for a given return value.
|
||||
// For example, Serbian may be written in Cyrillic or Latin script.
|
||||
// The confidence level indicates whether a value was explicitly specified,
|
||||
// whether it is typically the only possible value, or whether there is
|
||||
// an ambiguity.
|
||||
type Confidence int
|
||||
|
||||
const (
|
||||
No Confidence = iota // full confidence that there was no match
|
||||
Low // most likely value picked out of a set of alternatives
|
||||
High // value is generally assumed to be the correct match
|
||||
Exact // exact match or explicitly specified value
|
||||
)
|
||||
|
||||
var confName = []string{"No", "Low", "High", "Exact"}
|
||||
|
||||
func (c Confidence) String() string {
|
||||
return confName[c]
|
||||
}
|
||||
|
||||
// String returns the canonical string representation of the language tag.
|
||||
func (t Tag) String() string {
|
||||
return t.tag().String()
|
||||
}
|
||||
|
||||
// MarshalText implements encoding.TextMarshaler.
|
||||
func (t Tag) MarshalText() (text []byte, err error) {
|
||||
return t.tag().MarshalText()
|
||||
}
|
||||
|
||||
// UnmarshalText implements encoding.TextUnmarshaler.
|
||||
func (t *Tag) UnmarshalText(text []byte) error {
|
||||
var tag language.Tag
|
||||
err := tag.UnmarshalText(text)
|
||||
*t = makeTag(tag)
|
||||
return err
|
||||
}
|
||||
|
||||
// Base returns the base language of the language tag. If the base language is
|
||||
// unspecified, an attempt will be made to infer it from the context.
|
||||
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
|
||||
func (t Tag) Base() (Base, Confidence) {
|
||||
if b := t.lang(); b != 0 {
|
||||
return Base{b}, Exact
|
||||
}
|
||||
tt := t.tag()
|
||||
c := High
|
||||
if tt.ScriptID == 0 && !tt.RegionID.IsCountry() {
|
||||
c = Low
|
||||
}
|
||||
if tag, err := tt.Maximize(); err == nil && tag.LangID != 0 {
|
||||
return Base{tag.LangID}, c
|
||||
}
|
||||
return Base{0}, No
|
||||
}
|
||||
|
||||
// Script infers the script for the language tag. If it was not explicitly given, it will infer
|
||||
// a most likely candidate.
|
||||
// If more than one script is commonly used for a language, the most likely one
|
||||
// is returned with a low confidence indication. For example, it returns (Cyrl, Low)
|
||||
// for Serbian.
|
||||
// If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
|
||||
// as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
|
||||
// common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
|
||||
// See https://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
|
||||
// unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
|
||||
// Note that an inferred script is never guaranteed to be the correct one. Latin is
|
||||
// almost exclusively used for Afrikaans, but Arabic has been used for some texts
|
||||
// in the past. Also, the script that is commonly used may change over time.
|
||||
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
|
||||
func (t Tag) Script() (Script, Confidence) {
|
||||
if scr := t.script(); scr != 0 {
|
||||
return Script{scr}, Exact
|
||||
}
|
||||
tt := t.tag()
|
||||
sc, c := language.Script(_Zzzz), No
|
||||
if scr := tt.LangID.SuppressScript(); scr != 0 {
|
||||
// Note: it is not always the case that a language with a suppress
|
||||
// script value is only written in one script (e.g. kk, ms, pa).
|
||||
if tt.RegionID == 0 {
|
||||
return Script{scr}, High
|
||||
}
|
||||
sc, c = scr, High
|
||||
}
|
||||
if tag, err := tt.Maximize(); err == nil {
|
||||
if tag.ScriptID != sc {
|
||||
sc, c = tag.ScriptID, Low
|
||||
}
|
||||
} else {
|
||||
tt, _ = canonicalize(Deprecated|Macro, tt)
|
||||
if tag, err := tt.Maximize(); err == nil && tag.ScriptID != sc {
|
||||
sc, c = tag.ScriptID, Low
|
||||
}
|
||||
}
|
||||
return Script{sc}, c
|
||||
}
|
||||
|
||||
// Region returns the region for the language tag. If it was not explicitly given, it will
|
||||
// infer a most likely candidate from the context.
|
||||
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
|
||||
func (t Tag) Region() (Region, Confidence) {
|
||||
if r := t.region(); r != 0 {
|
||||
return Region{r}, Exact
|
||||
}
|
||||
tt := t.tag()
|
||||
if tt, err := tt.Maximize(); err == nil {
|
||||
return Region{tt.RegionID}, Low // TODO: differentiate between high and low.
|
||||
}
|
||||
tt, _ = canonicalize(Deprecated|Macro, tt)
|
||||
if tag, err := tt.Maximize(); err == nil {
|
||||
return Region{tag.RegionID}, Low
|
||||
}
|
||||
return Region{_ZZ}, No // TODO: return world instead of undetermined?
|
||||
}
|
||||
|
||||
// Variants returns the variants specified explicitly for this language tag.
|
||||
// or nil if no variant was specified.
|
||||
func (t Tag) Variants() []Variant {
|
||||
if !compact.Tag(t).MayHaveVariants() {
|
||||
return nil
|
||||
}
|
||||
v := []Variant{}
|
||||
x, str := "", t.tag().Variants()
|
||||
for str != "" {
|
||||
x, str = nextToken(str)
|
||||
v = append(v, Variant{x})
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
|
||||
// specific language are substituted with fields from the parent language.
|
||||
// The parent for a language may change for newer versions of CLDR.
|
||||
//
|
||||
// Parent returns a tag for a less specific language that is mutually
|
||||
// intelligible or Und if there is no such language. This may not be the same as
|
||||
// simply stripping the last BCP 47 subtag. For instance, the parent of "zh-TW"
|
||||
// is "zh-Hant", and the parent of "zh-Hant" is "und".
|
||||
func (t Tag) Parent() Tag {
|
||||
return Tag(compact.Tag(t).Parent())
|
||||
}
|
||||
|
||||
// nextToken returns token t and the rest of the string.
|
||||
func nextToken(s string) (t, tail string) {
|
||||
p := strings.Index(s[1:], "-")
|
||||
if p == -1 {
|
||||
return s[1:], ""
|
||||
}
|
||||
p++
|
||||
return s[1:p], s[p:]
|
||||
}
|
||||
|
||||
// Extension is a single BCP 47 extension.
|
||||
type Extension struct {
|
||||
s string
|
||||
}
|
||||
|
||||
// String returns the string representation of the extension, including the
|
||||
// type tag.
|
||||
func (e Extension) String() string {
|
||||
return e.s
|
||||
}
|
||||
|
||||
// ParseExtension parses s as an extension and returns it on success.
|
||||
func ParseExtension(s string) (e Extension, err error) {
|
||||
ext, err := language.ParseExtension(s)
|
||||
return Extension{ext}, err
|
||||
}
|
||||
|
||||
// Type returns the one-byte extension type of e. It returns 0 for the zero
|
||||
// exception.
|
||||
func (e Extension) Type() byte {
|
||||
if e.s == "" {
|
||||
return 0
|
||||
}
|
||||
return e.s[0]
|
||||
}
|
||||
|
||||
// Tokens returns the list of tokens of e.
|
||||
func (e Extension) Tokens() []string {
|
||||
return strings.Split(e.s, "-")
|
||||
}
|
||||
|
||||
// Extension returns the extension of type x for tag t. It will return
|
||||
// false for ok if t does not have the requested extension. The returned
|
||||
// extension will be invalid in this case.
|
||||
func (t Tag) Extension(x byte) (ext Extension, ok bool) {
|
||||
if !compact.Tag(t).MayHaveExtensions() {
|
||||
return Extension{}, false
|
||||
}
|
||||
e, ok := t.tag().Extension(x)
|
||||
return Extension{e}, ok
|
||||
}
|
||||
|
||||
// Extensions returns all extensions of t.
|
||||
func (t Tag) Extensions() []Extension {
|
||||
if !compact.Tag(t).MayHaveExtensions() {
|
||||
return nil
|
||||
}
|
||||
e := []Extension{}
|
||||
for _, ext := range t.tag().Extensions() {
|
||||
e = append(e, Extension{ext})
|
||||
}
|
||||
return e
|
||||
}
|
||||
|
||||
// TypeForKey returns the type associated with the given key, where key and type
|
||||
// are of the allowed values defined for the Unicode locale extension ('u') in
|
||||
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// TypeForKey will traverse the inheritance chain to get the correct value.
|
||||
//
|
||||
// If there are multiple types associated with a key, only the first will be
|
||||
// returned. If there is no type associated with a key, it returns the empty
|
||||
// string.
|
||||
func (t Tag) TypeForKey(key string) string {
|
||||
if !compact.Tag(t).MayHaveExtensions() {
|
||||
if key != "rg" && key != "va" {
|
||||
return ""
|
||||
}
|
||||
}
|
||||
return t.tag().TypeForKey(key)
|
||||
}
|
||||
|
||||
// SetTypeForKey returns a new Tag with the key set to type, where key and type
|
||||
// are of the allowed values defined for the Unicode locale extension ('u') in
|
||||
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// An empty value removes an existing pair with the same key.
|
||||
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
|
||||
tt, err := t.tag().SetTypeForKey(key, value)
|
||||
return makeTag(tt), err
|
||||
}
|
||||
|
||||
// NumCompactTags is the number of compact tags. The maximum tag is
|
||||
// NumCompactTags-1.
|
||||
const NumCompactTags = compact.NumCompactTags
|
||||
|
||||
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
|
||||
// for which data exists in the text repository.The index will change over time
|
||||
// and should not be stored in persistent storage. If t does not match a compact
|
||||
// index, exact will be false and the compact index will be returned for the
|
||||
// first match after repeatedly taking the Parent of t.
|
||||
func CompactIndex(t Tag) (index int, exact bool) {
|
||||
id, exact := compact.LanguageID(compact.Tag(t))
|
||||
return int(id), exact
|
||||
}
|
||||
|
||||
var root = language.Tag{}
|
||||
|
||||
// Base is an ISO 639 language code, used for encoding the base language
|
||||
// of a language tag.
|
||||
type Base struct {
|
||||
langID language.Language
|
||||
}
|
||||
|
||||
// ParseBase parses a 2- or 3-letter ISO 639 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown language identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseBase(s string) (Base, error) {
|
||||
l, err := language.ParseBase(s)
|
||||
return Base{l}, err
|
||||
}
|
||||
|
||||
// String returns the BCP 47 representation of the base language.
|
||||
func (b Base) String() string {
|
||||
return b.langID.String()
|
||||
}
|
||||
|
||||
// ISO3 returns the ISO 639-3 language code.
|
||||
func (b Base) ISO3() string {
|
||||
return b.langID.ISO3()
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether this language code is reserved for private use.
|
||||
func (b Base) IsPrivateUse() bool {
|
||||
return b.langID.IsPrivateUse()
|
||||
}
|
||||
|
||||
// Script is a 4-letter ISO 15924 code for representing scripts.
|
||||
// It is idiomatically represented in title case.
|
||||
type Script struct {
|
||||
scriptID language.Script
|
||||
}
|
||||
|
||||
// ParseScript parses a 4-letter ISO 15924 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown script identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseScript(s string) (Script, error) {
|
||||
sc, err := language.ParseScript(s)
|
||||
return Script{sc}, err
|
||||
}
|
||||
|
||||
// String returns the script code in title case.
|
||||
// It returns "Zzzz" for an unspecified script.
|
||||
func (s Script) String() string {
|
||||
return s.scriptID.String()
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether this script code is reserved for private use.
|
||||
func (s Script) IsPrivateUse() bool {
|
||||
return s.scriptID.IsPrivateUse()
|
||||
}
|
||||
|
||||
// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
|
||||
type Region struct {
|
||||
regionID language.Region
|
||||
}
|
||||
|
||||
// EncodeM49 returns the Region for the given UN M.49 code.
|
||||
// It returns an error if r is not a valid code.
|
||||
func EncodeM49(r int) (Region, error) {
|
||||
rid, err := language.EncodeM49(r)
|
||||
return Region{rid}, err
|
||||
}
|
||||
|
||||
// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown region identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseRegion(s string) (Region, error) {
|
||||
r, err := language.ParseRegion(s)
|
||||
return Region{r}, err
|
||||
}
|
||||
|
||||
// String returns the BCP 47 representation for the region.
|
||||
// It returns "ZZ" for an unspecified region.
|
||||
func (r Region) String() string {
|
||||
return r.regionID.String()
|
||||
}
|
||||
|
||||
// ISO3 returns the 3-letter ISO code of r.
|
||||
// Note that not all regions have a 3-letter ISO code.
|
||||
// In such cases this method returns "ZZZ".
|
||||
func (r Region) ISO3() string {
|
||||
return r.regionID.ISO3()
|
||||
}
|
||||
|
||||
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
|
||||
// is not defined for r.
|
||||
func (r Region) M49() int {
|
||||
return r.regionID.M49()
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
|
||||
// may include private-use tags that are assigned by CLDR and used in this
|
||||
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
|
||||
func (r Region) IsPrivateUse() bool {
|
||||
return r.regionID.IsPrivateUse()
|
||||
}
|
||||
|
||||
// IsCountry returns whether this region is a country or autonomous area. This
|
||||
// includes non-standard definitions from CLDR.
|
||||
func (r Region) IsCountry() bool {
|
||||
return r.regionID.IsCountry()
|
||||
}
|
||||
|
||||
// IsGroup returns whether this region defines a collection of regions. This
|
||||
// includes non-standard definitions from CLDR.
|
||||
func (r Region) IsGroup() bool {
|
||||
return r.regionID.IsGroup()
|
||||
}
|
||||
|
||||
// Contains returns whether Region c is contained by Region r. It returns true
|
||||
// if c == r.
|
||||
func (r Region) Contains(c Region) bool {
|
||||
return r.regionID.Contains(c.regionID)
|
||||
}
|
||||
|
||||
// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
|
||||
// In all other cases it returns either the region itself or an error.
|
||||
//
|
||||
// This method may return an error for a region for which there exists a
|
||||
// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
|
||||
// region will already be canonicalized it was obtained from a Tag that was
|
||||
// obtained using any of the default methods.
|
||||
func (r Region) TLD() (Region, error) {
|
||||
tld, err := r.regionID.TLD()
|
||||
return Region{tld}, err
|
||||
}
|
||||
|
||||
// Canonicalize returns the region or a possible replacement if the region is
|
||||
// deprecated. It will not return a replacement for deprecated regions that
|
||||
// are split into multiple regions.
|
||||
func (r Region) Canonicalize() Region {
|
||||
return Region{r.regionID.Canonicalize()}
|
||||
}
|
||||
|
||||
// Variant represents a registered variant of a language as defined by BCP 47.
|
||||
type Variant struct {
|
||||
variant string
|
||||
}
|
||||
|
||||
// ParseVariant parses and returns a Variant. An error is returned if s is not
|
||||
// a valid variant.
|
||||
func ParseVariant(s string) (Variant, error) {
|
||||
v, err := language.ParseVariant(s)
|
||||
return Variant{v.String()}, err
|
||||
}
|
||||
|
||||
// String returns the string representation of the variant.
|
||||
func (v Variant) String() string {
|
||||
return v.variant
|
||||
}
|
|
@ -0,0 +1,735 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/language"
|
||||
)
|
||||
|
||||
// A MatchOption configures a Matcher.
|
||||
type MatchOption func(*matcher)
|
||||
|
||||
// PreferSameScript will, in the absence of a match, result in the first
|
||||
// preferred tag with the same script as a supported tag to match this supported
|
||||
// tag. The default is currently true, but this may change in the future.
|
||||
func PreferSameScript(preferSame bool) MatchOption {
|
||||
return func(m *matcher) { m.preferSameScript = preferSame }
|
||||
}
|
||||
|
||||
// TODO(v1.0.0): consider making Matcher a concrete type, instead of interface.
|
||||
// There doesn't seem to be too much need for multiple types.
|
||||
// Making it a concrete type allows MatchStrings to be a method, which will
|
||||
// improve its discoverability.
|
||||
|
||||
// MatchStrings parses and matches the given strings until one of them matches
|
||||
// the language in the Matcher. A string may be an Accept-Language header as
|
||||
// handled by ParseAcceptLanguage. The default language is returned if no
|
||||
// other language matched.
|
||||
func MatchStrings(m Matcher, lang ...string) (tag Tag, index int) {
|
||||
for _, accept := range lang {
|
||||
desired, _, err := ParseAcceptLanguage(accept)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if tag, index, conf := m.Match(desired...); conf != No {
|
||||
return tag, index
|
||||
}
|
||||
}
|
||||
tag, index, _ = m.Match()
|
||||
return
|
||||
}
|
||||
|
||||
// Matcher is the interface that wraps the Match method.
|
||||
//
|
||||
// Match returns the best match for any of the given tags, along with
|
||||
// a unique index associated with the returned tag and a confidence
|
||||
// score.
|
||||
type Matcher interface {
|
||||
Match(t ...Tag) (tag Tag, index int, c Confidence)
|
||||
}
|
||||
|
||||
// Comprehends reports the confidence score for a speaker of a given language
|
||||
// to being able to comprehend the written form of an alternative language.
|
||||
func Comprehends(speaker, alternative Tag) Confidence {
|
||||
_, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
|
||||
return c
|
||||
}
|
||||
|
||||
// NewMatcher returns a Matcher that matches an ordered list of preferred tags
|
||||
// against a list of supported tags based on written intelligibility, closeness
|
||||
// of dialect, equivalence of subtags and various other rules. It is initialized
|
||||
// with the list of supported tags. The first element is used as the default
|
||||
// value in case no match is found.
|
||||
//
|
||||
// Its Match method matches the first of the given Tags to reach a certain
|
||||
// confidence threshold. The tags passed to Match should therefore be specified
|
||||
// in order of preference. Extensions are ignored for matching.
|
||||
//
|
||||
// The index returned by the Match method corresponds to the index of the
|
||||
// matched tag in t, but is augmented with the Unicode extension ('u')of the
|
||||
// corresponding preferred tag. This allows user locale options to be passed
|
||||
// transparently.
|
||||
func NewMatcher(t []Tag, options ...MatchOption) Matcher {
|
||||
return newMatcher(t, options)
|
||||
}
|
||||
|
||||
func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
|
||||
var tt language.Tag
|
||||
match, w, c := m.getBest(want...)
|
||||
if match != nil {
|
||||
tt, index = match.tag, match.index
|
||||
} else {
|
||||
// TODO: this should be an option
|
||||
tt = m.default_.tag
|
||||
if m.preferSameScript {
|
||||
outer:
|
||||
for _, w := range want {
|
||||
script, _ := w.Script()
|
||||
if script.scriptID == 0 {
|
||||
// Don't do anything if there is no script, such as with
|
||||
// private subtags.
|
||||
continue
|
||||
}
|
||||
for i, h := range m.supported {
|
||||
if script.scriptID == h.maxScript {
|
||||
tt, index = h.tag, i
|
||||
break outer
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: select first language tag based on script.
|
||||
}
|
||||
if w.RegionID != tt.RegionID && w.RegionID != 0 {
|
||||
if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
|
||||
tt.RegionID = w.RegionID
|
||||
tt.RemakeString()
|
||||
} else if r := w.RegionID.String(); len(r) == 2 {
|
||||
// TODO: also filter macro and deprecated.
|
||||
tt, _ = tt.SetTypeForKey("rg", strings.ToLower(r)+"zzzz")
|
||||
}
|
||||
}
|
||||
// Copy options from the user-provided tag into the result tag. This is hard
|
||||
// to do after the fact, so we do it here.
|
||||
// TODO: add in alternative variants to -u-va-.
|
||||
// TODO: add preferred region to -u-rg-.
|
||||
if e := w.Extensions(); len(e) > 0 {
|
||||
b := language.Builder{}
|
||||
b.SetTag(tt)
|
||||
for _, e := range e {
|
||||
b.AddExt(e)
|
||||
}
|
||||
tt = b.Make()
|
||||
}
|
||||
return makeTag(tt), index, c
|
||||
}
|
||||
|
||||
// ErrMissingLikelyTagsData indicates no information was available
|
||||
// to compute likely values of missing tags.
|
||||
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
|
||||
|
||||
// func (t *Tag) setTagsFrom(id Tag) {
|
||||
// t.LangID = id.LangID
|
||||
// t.ScriptID = id.ScriptID
|
||||
// t.RegionID = id.RegionID
|
||||
// }
|
||||
|
||||
// Tag Matching
|
||||
// CLDR defines an algorithm for finding the best match between two sets of language
|
||||
// tags. The basic algorithm defines how to score a possible match and then find
|
||||
// the match with the best score
|
||||
// (see https://www.unicode.org/reports/tr35/#LanguageMatching).
|
||||
// Using scoring has several disadvantages. The scoring obfuscates the importance of
|
||||
// the various factors considered, making the algorithm harder to understand. Using
|
||||
// scoring also requires the full score to be computed for each pair of tags.
|
||||
//
|
||||
// We will use a different algorithm which aims to have the following properties:
|
||||
// - clarity on the precedence of the various selection factors, and
|
||||
// - improved performance by allowing early termination of a comparison.
|
||||
//
|
||||
// Matching algorithm (overview)
|
||||
// Input:
|
||||
// - supported: a set of supported tags
|
||||
// - default: the default tag to return in case there is no match
|
||||
// - desired: list of desired tags, ordered by preference, starting with
|
||||
// the most-preferred.
|
||||
//
|
||||
// Algorithm:
|
||||
// 1) Set the best match to the lowest confidence level
|
||||
// 2) For each tag in "desired":
|
||||
// a) For each tag in "supported":
|
||||
// 1) compute the match between the two tags.
|
||||
// 2) if the match is better than the previous best match, replace it
|
||||
// with the new match. (see next section)
|
||||
// b) if the current best match is Exact and pin is true the result will be
|
||||
// frozen to the language found thusfar, although better matches may
|
||||
// still be found for the same language.
|
||||
// 3) If the best match so far is below a certain threshold, return "default".
|
||||
//
|
||||
// Ranking:
|
||||
// We use two phases to determine whether one pair of tags are a better match
|
||||
// than another pair of tags. First, we determine a rough confidence level. If the
|
||||
// levels are different, the one with the highest confidence wins.
|
||||
// Second, if the rough confidence levels are identical, we use a set of tie-breaker
|
||||
// rules.
|
||||
//
|
||||
// The confidence level of matching a pair of tags is determined by finding the
|
||||
// lowest confidence level of any matches of the corresponding subtags (the
|
||||
// result is deemed as good as its weakest link).
|
||||
// We define the following levels:
|
||||
// Exact - An exact match of a subtag, before adding likely subtags.
|
||||
// MaxExact - An exact match of a subtag, after adding likely subtags.
|
||||
// [See Note 2].
|
||||
// High - High level of mutual intelligibility between different subtag
|
||||
// variants.
|
||||
// Low - Low level of mutual intelligibility between different subtag
|
||||
// variants.
|
||||
// No - No mutual intelligibility.
|
||||
//
|
||||
// The following levels can occur for each type of subtag:
|
||||
// Base: Exact, MaxExact, High, Low, No
|
||||
// Script: Exact, MaxExact [see Note 3], Low, No
|
||||
// Region: Exact, MaxExact, High
|
||||
// Variant: Exact, High
|
||||
// Private: Exact, No
|
||||
//
|
||||
// Any result with a confidence level of Low or higher is deemed a possible match.
|
||||
// Once a desired tag matches any of the supported tags with a level of MaxExact
|
||||
// or higher, the next desired tag is not considered (see Step 2.b).
|
||||
// Note that CLDR provides languageMatching data that defines close equivalence
|
||||
// classes for base languages, scripts and regions.
|
||||
//
|
||||
// Tie-breaking
|
||||
// If we get the same confidence level for two matches, we apply a sequence of
|
||||
// tie-breaking rules. The first that succeeds defines the result. The rules are
|
||||
// applied in the following order.
|
||||
// 1) Original language was defined and was identical.
|
||||
// 2) Original region was defined and was identical.
|
||||
// 3) Distance between two maximized regions was the smallest.
|
||||
// 4) Original script was defined and was identical.
|
||||
// 5) Distance from want tag to have tag using the parent relation [see Note 5.]
|
||||
// If there is still no winner after these rules are applied, the first match
|
||||
// found wins.
|
||||
//
|
||||
// Notes:
|
||||
// [2] In practice, as matching of Exact is done in a separate phase from
|
||||
// matching the other levels, we reuse the Exact level to mean MaxExact in
|
||||
// the second phase. As a consequence, we only need the levels defined by
|
||||
// the Confidence type. The MaxExact confidence level is mapped to High in
|
||||
// the public API.
|
||||
// [3] We do not differentiate between maximized script values that were derived
|
||||
// from suppressScript versus most likely tag data. We determined that in
|
||||
// ranking the two, one ranks just after the other. Moreover, the two cannot
|
||||
// occur concurrently. As a consequence, they are identical for practical
|
||||
// purposes.
|
||||
// [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
|
||||
// the MaxExact level to allow iw vs he to still be a closer match than
|
||||
// en-AU vs en-US, for example.
|
||||
// [5] In CLDR a locale inherits fields that are unspecified for this locale
|
||||
// from its parent. Therefore, if a locale is a parent of another locale,
|
||||
// it is a strong measure for closeness, especially when no other tie
|
||||
// breaker rule applies. One could also argue it is inconsistent, for
|
||||
// example, when pt-AO matches pt (which CLDR equates with pt-BR), even
|
||||
// though its parent is pt-PT according to the inheritance rules.
|
||||
//
|
||||
// Implementation Details:
|
||||
// There are several performance considerations worth pointing out. Most notably,
|
||||
// we preprocess as much as possible (within reason) at the time of creation of a
|
||||
// matcher. This includes:
|
||||
// - creating a per-language map, which includes data for the raw base language
|
||||
// and its canonicalized variant (if applicable),
|
||||
// - expanding entries for the equivalence classes defined in CLDR's
|
||||
// languageMatch data.
|
||||
// The per-language map ensures that typically only a very small number of tags
|
||||
// need to be considered. The pre-expansion of canonicalized subtags and
|
||||
// equivalence classes reduces the amount of map lookups that need to be done at
|
||||
// runtime.
|
||||
|
||||
// matcher keeps a set of supported language tags, indexed by language.
|
||||
type matcher struct {
|
||||
default_ *haveTag
|
||||
supported []*haveTag
|
||||
index map[language.Language]*matchHeader
|
||||
passSettings bool
|
||||
preferSameScript bool
|
||||
}
|
||||
|
||||
// matchHeader has the lists of tags for exact matches and matches based on
|
||||
// maximized and canonicalized tags for a given language.
|
||||
type matchHeader struct {
|
||||
haveTags []*haveTag
|
||||
original bool
|
||||
}
|
||||
|
||||
// haveTag holds a supported Tag and its maximized script and region. The maximized
|
||||
// or canonicalized language is not stored as it is not needed during matching.
|
||||
type haveTag struct {
|
||||
tag language.Tag
|
||||
|
||||
// index of this tag in the original list of supported tags.
|
||||
index int
|
||||
|
||||
// conf is the maximum confidence that can result from matching this haveTag.
|
||||
// When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
|
||||
conf Confidence
|
||||
|
||||
// Maximized region and script.
|
||||
maxRegion language.Region
|
||||
maxScript language.Script
|
||||
|
||||
// altScript may be checked as an alternative match to maxScript. If altScript
|
||||
// matches, the confidence level for this match is Low. Theoretically there
|
||||
// could be multiple alternative scripts. This does not occur in practice.
|
||||
altScript language.Script
|
||||
|
||||
// nextMax is the index of the next haveTag with the same maximized tags.
|
||||
nextMax uint16
|
||||
}
|
||||
|
||||
func makeHaveTag(tag language.Tag, index int) (haveTag, language.Language) {
|
||||
max := tag
|
||||
if tag.LangID != 0 || tag.RegionID != 0 || tag.ScriptID != 0 {
|
||||
max, _ = canonicalize(All, max)
|
||||
max, _ = max.Maximize()
|
||||
max.RemakeString()
|
||||
}
|
||||
return haveTag{tag, index, Exact, max.RegionID, max.ScriptID, altScript(max.LangID, max.ScriptID), 0}, max.LangID
|
||||
}
|
||||
|
||||
// altScript returns an alternative script that may match the given script with
|
||||
// a low confidence. At the moment, the langMatch data allows for at most one
|
||||
// script to map to another and we rely on this to keep the code simple.
|
||||
func altScript(l language.Language, s language.Script) language.Script {
|
||||
for _, alt := range matchScript {
|
||||
// TODO: also match cases where language is not the same.
|
||||
if (language.Language(alt.wantLang) == l || language.Language(alt.haveLang) == l) &&
|
||||
language.Script(alt.haveScript) == s {
|
||||
return language.Script(alt.wantScript)
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// addIfNew adds a haveTag to the list of tags only if it is a unique tag.
|
||||
// Tags that have the same maximized values are linked by index.
|
||||
func (h *matchHeader) addIfNew(n haveTag, exact bool) {
|
||||
h.original = h.original || exact
|
||||
// Don't add new exact matches.
|
||||
for _, v := range h.haveTags {
|
||||
if equalsRest(v.tag, n.tag) {
|
||||
return
|
||||
}
|
||||
}
|
||||
// Allow duplicate maximized tags, but create a linked list to allow quickly
|
||||
// comparing the equivalents and bail out.
|
||||
for i, v := range h.haveTags {
|
||||
if v.maxScript == n.maxScript &&
|
||||
v.maxRegion == n.maxRegion &&
|
||||
v.tag.VariantOrPrivateUseTags() == n.tag.VariantOrPrivateUseTags() {
|
||||
for h.haveTags[i].nextMax != 0 {
|
||||
i = int(h.haveTags[i].nextMax)
|
||||
}
|
||||
h.haveTags[i].nextMax = uint16(len(h.haveTags))
|
||||
break
|
||||
}
|
||||
}
|
||||
h.haveTags = append(h.haveTags, &n)
|
||||
}
|
||||
|
||||
// header returns the matchHeader for the given language. It creates one if
|
||||
// it doesn't already exist.
|
||||
func (m *matcher) header(l language.Language) *matchHeader {
|
||||
if h := m.index[l]; h != nil {
|
||||
return h
|
||||
}
|
||||
h := &matchHeader{}
|
||||
m.index[l] = h
|
||||
return h
|
||||
}
|
||||
|
||||
func toConf(d uint8) Confidence {
|
||||
if d <= 10 {
|
||||
return High
|
||||
}
|
||||
if d < 30 {
|
||||
return Low
|
||||
}
|
||||
return No
|
||||
}
|
||||
|
||||
// newMatcher builds an index for the given supported tags and returns it as
|
||||
// a matcher. It also expands the index by considering various equivalence classes
|
||||
// for a given tag.
|
||||
func newMatcher(supported []Tag, options []MatchOption) *matcher {
|
||||
m := &matcher{
|
||||
index: make(map[language.Language]*matchHeader),
|
||||
preferSameScript: true,
|
||||
}
|
||||
for _, o := range options {
|
||||
o(m)
|
||||
}
|
||||
if len(supported) == 0 {
|
||||
m.default_ = &haveTag{}
|
||||
return m
|
||||
}
|
||||
// Add supported languages to the index. Add exact matches first to give
|
||||
// them precedence.
|
||||
for i, tag := range supported {
|
||||
tt := tag.tag()
|
||||
pair, _ := makeHaveTag(tt, i)
|
||||
m.header(tt.LangID).addIfNew(pair, true)
|
||||
m.supported = append(m.supported, &pair)
|
||||
}
|
||||
m.default_ = m.header(supported[0].lang()).haveTags[0]
|
||||
// Keep these in two different loops to support the case that two equivalent
|
||||
// languages are distinguished, such as iw and he.
|
||||
for i, tag := range supported {
|
||||
tt := tag.tag()
|
||||
pair, max := makeHaveTag(tt, i)
|
||||
if max != tt.LangID {
|
||||
m.header(max).addIfNew(pair, true)
|
||||
}
|
||||
}
|
||||
|
||||
// update is used to add indexes in the map for equivalent languages.
|
||||
// update will only add entries to original indexes, thus not computing any
|
||||
// transitive relations.
|
||||
update := func(want, have uint16, conf Confidence) {
|
||||
if hh := m.index[language.Language(have)]; hh != nil {
|
||||
if !hh.original {
|
||||
return
|
||||
}
|
||||
hw := m.header(language.Language(want))
|
||||
for _, ht := range hh.haveTags {
|
||||
v := *ht
|
||||
if conf < v.conf {
|
||||
v.conf = conf
|
||||
}
|
||||
v.nextMax = 0 // this value needs to be recomputed
|
||||
if v.altScript != 0 {
|
||||
v.altScript = altScript(language.Language(want), v.maxScript)
|
||||
}
|
||||
hw.addIfNew(v, conf == Exact && hh.original)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add entries for languages with mutual intelligibility as defined by CLDR's
|
||||
// languageMatch data.
|
||||
for _, ml := range matchLang {
|
||||
update(ml.want, ml.have, toConf(ml.distance))
|
||||
if !ml.oneway {
|
||||
update(ml.have, ml.want, toConf(ml.distance))
|
||||
}
|
||||
}
|
||||
|
||||
// Add entries for possible canonicalizations. This is an optimization to
|
||||
// ensure that only one map lookup needs to be done at runtime per desired tag.
|
||||
// First we match deprecated equivalents. If they are perfect equivalents
|
||||
// (their canonicalization simply substitutes a different language code, but
|
||||
// nothing else), the match confidence is Exact, otherwise it is High.
|
||||
for i, lm := range language.AliasMap {
|
||||
// If deprecated codes match and there is no fiddling with the script
|
||||
// or region, we consider it an exact match.
|
||||
conf := Exact
|
||||
if language.AliasTypes[i] != language.Macro {
|
||||
if !isExactEquivalent(language.Language(lm.From)) {
|
||||
conf = High
|
||||
}
|
||||
update(lm.To, lm.From, conf)
|
||||
}
|
||||
update(lm.From, lm.To, conf)
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// getBest gets the best matching tag in m for any of the given tags, taking into
|
||||
// account the order of preference of the given tags.
|
||||
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig language.Tag, c Confidence) {
|
||||
best := bestMatch{}
|
||||
for i, ww := range want {
|
||||
w := ww.tag()
|
||||
var max language.Tag
|
||||
// Check for exact match first.
|
||||
h := m.index[w.LangID]
|
||||
if w.LangID != 0 {
|
||||
if h == nil {
|
||||
continue
|
||||
}
|
||||
// Base language is defined.
|
||||
max, _ = canonicalize(Legacy|Deprecated|Macro, w)
|
||||
// A region that is added through canonicalization is stronger than
|
||||
// a maximized region: set it in the original (e.g. mo -> ro-MD).
|
||||
if w.RegionID != max.RegionID {
|
||||
w.RegionID = max.RegionID
|
||||
}
|
||||
// TODO: should we do the same for scripts?
|
||||
// See test case: en, sr, nl ; sh ; sr
|
||||
max, _ = max.Maximize()
|
||||
} else {
|
||||
// Base language is not defined.
|
||||
if h != nil {
|
||||
for i := range h.haveTags {
|
||||
have := h.haveTags[i]
|
||||
if equalsRest(have.tag, w) {
|
||||
return have, w, Exact
|
||||
}
|
||||
}
|
||||
}
|
||||
if w.ScriptID == 0 && w.RegionID == 0 {
|
||||
// We skip all tags matching und for approximate matching, including
|
||||
// private tags.
|
||||
continue
|
||||
}
|
||||
max, _ = w.Maximize()
|
||||
if h = m.index[max.LangID]; h == nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
pin := true
|
||||
for _, t := range want[i+1:] {
|
||||
if w.LangID == t.lang() {
|
||||
pin = false
|
||||
break
|
||||
}
|
||||
}
|
||||
// Check for match based on maximized tag.
|
||||
for i := range h.haveTags {
|
||||
have := h.haveTags[i]
|
||||
best.update(have, w, max.ScriptID, max.RegionID, pin)
|
||||
if best.conf == Exact {
|
||||
for have.nextMax != 0 {
|
||||
have = h.haveTags[have.nextMax]
|
||||
best.update(have, w, max.ScriptID, max.RegionID, pin)
|
||||
}
|
||||
return best.have, best.want, best.conf
|
||||
}
|
||||
}
|
||||
}
|
||||
if best.conf <= No {
|
||||
if len(want) != 0 {
|
||||
return nil, want[0].tag(), No
|
||||
}
|
||||
return nil, language.Tag{}, No
|
||||
}
|
||||
return best.have, best.want, best.conf
|
||||
}
|
||||
|
||||
// bestMatch accumulates the best match so far.
|
||||
type bestMatch struct {
|
||||
have *haveTag
|
||||
want language.Tag
|
||||
conf Confidence
|
||||
pinnedRegion language.Region
|
||||
pinLanguage bool
|
||||
sameRegionGroup bool
|
||||
// Cached results from applying tie-breaking rules.
|
||||
origLang bool
|
||||
origReg bool
|
||||
paradigmReg bool
|
||||
regGroupDist uint8
|
||||
origScript bool
|
||||
}
|
||||
|
||||
// update updates the existing best match if the new pair is considered to be a
|
||||
// better match. To determine if the given pair is a better match, it first
|
||||
// computes the rough confidence level. If this surpasses the current match, it
|
||||
// will replace it and update the tie-breaker rule cache. If there is a tie, it
|
||||
// proceeds with applying a series of tie-breaker rules. If there is no
|
||||
// conclusive winner after applying the tie-breaker rules, it leaves the current
|
||||
// match as the preferred match.
|
||||
//
|
||||
// If pin is true and have and tag are a strong match, it will henceforth only
|
||||
// consider matches for this language. This corresponds to the idea that most
|
||||
// users have a strong preference for the first defined language. A user can
|
||||
// still prefer a second language over a dialect of the preferred language by
|
||||
// explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should
|
||||
// be false.
|
||||
func (m *bestMatch) update(have *haveTag, tag language.Tag, maxScript language.Script, maxRegion language.Region, pin bool) {
|
||||
// Bail if the maximum attainable confidence is below that of the current best match.
|
||||
c := have.conf
|
||||
if c < m.conf {
|
||||
return
|
||||
}
|
||||
// Don't change the language once we already have found an exact match.
|
||||
if m.pinLanguage && tag.LangID != m.want.LangID {
|
||||
return
|
||||
}
|
||||
// Pin the region group if we are comparing tags for the same language.
|
||||
if tag.LangID == m.want.LangID && m.sameRegionGroup {
|
||||
_, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.LangID)
|
||||
if !sameGroup {
|
||||
return
|
||||
}
|
||||
}
|
||||
if c == Exact && have.maxScript == maxScript {
|
||||
// If there is another language and then another entry of this language,
|
||||
// don't pin anything, otherwise pin the language.
|
||||
m.pinLanguage = pin
|
||||
}
|
||||
if equalsRest(have.tag, tag) {
|
||||
} else if have.maxScript != maxScript {
|
||||
// There is usually very little comprehension between different scripts.
|
||||
// In a few cases there may still be Low comprehension. This possibility
|
||||
// is pre-computed and stored in have.altScript.
|
||||
if Low < m.conf || have.altScript != maxScript {
|
||||
return
|
||||
}
|
||||
c = Low
|
||||
} else if have.maxRegion != maxRegion {
|
||||
if High < c {
|
||||
// There is usually a small difference between languages across regions.
|
||||
c = High
|
||||
}
|
||||
}
|
||||
|
||||
// We store the results of the computations of the tie-breaker rules along
|
||||
// with the best match. There is no need to do the checks once we determine
|
||||
// we have a winner, but we do still need to do the tie-breaker computations.
|
||||
// We use "beaten" to keep track if we still need to do the checks.
|
||||
beaten := false // true if the new pair defeats the current one.
|
||||
if c != m.conf {
|
||||
if c < m.conf {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Tie-breaker rules:
|
||||
// We prefer if the pre-maximized language was specified and identical.
|
||||
origLang := have.tag.LangID == tag.LangID && tag.LangID != 0
|
||||
if !beaten && m.origLang != origLang {
|
||||
if m.origLang {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// We prefer if the pre-maximized region was specified and identical.
|
||||
origReg := have.tag.RegionID == tag.RegionID && tag.RegionID != 0
|
||||
if !beaten && m.origReg != origReg {
|
||||
if m.origReg {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.LangID)
|
||||
if !beaten && m.regGroupDist != regGroupDist {
|
||||
if regGroupDist > m.regGroupDist {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
paradigmReg := isParadigmLocale(tag.LangID, have.maxRegion)
|
||||
if !beaten && m.paradigmReg != paradigmReg {
|
||||
if !paradigmReg {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Next we prefer if the pre-maximized script was specified and identical.
|
||||
origScript := have.tag.ScriptID == tag.ScriptID && tag.ScriptID != 0
|
||||
if !beaten && m.origScript != origScript {
|
||||
if m.origScript {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Update m to the newly found best match.
|
||||
if beaten {
|
||||
m.have = have
|
||||
m.want = tag
|
||||
m.conf = c
|
||||
m.pinnedRegion = maxRegion
|
||||
m.sameRegionGroup = sameGroup
|
||||
m.origLang = origLang
|
||||
m.origReg = origReg
|
||||
m.paradigmReg = paradigmReg
|
||||
m.origScript = origScript
|
||||
m.regGroupDist = regGroupDist
|
||||
}
|
||||
}
|
||||
|
||||
func isParadigmLocale(lang language.Language, r language.Region) bool {
|
||||
for _, e := range paradigmLocales {
|
||||
if language.Language(e[0]) == lang && (r == language.Region(e[1]) || r == language.Region(e[2])) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// regionGroupDist computes the distance between two regions based on their
|
||||
// CLDR grouping.
|
||||
func regionGroupDist(a, b language.Region, script language.Script, lang language.Language) (dist uint8, same bool) {
|
||||
const defaultDistance = 4
|
||||
|
||||
aGroup := uint(regionToGroups[a]) << 1
|
||||
bGroup := uint(regionToGroups[b]) << 1
|
||||
for _, ri := range matchRegion {
|
||||
if language.Language(ri.lang) == lang && (ri.script == 0 || language.Script(ri.script) == script) {
|
||||
group := uint(1 << (ri.group &^ 0x80))
|
||||
if 0x80&ri.group == 0 {
|
||||
if aGroup&bGroup&group != 0 { // Both regions are in the group.
|
||||
return ri.distance, ri.distance == defaultDistance
|
||||
}
|
||||
} else {
|
||||
if (aGroup|bGroup)&group == 0 { // Both regions are not in the group.
|
||||
return ri.distance, ri.distance == defaultDistance
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return defaultDistance, true
|
||||
}
|
||||
|
||||
// equalsRest compares everything except the language.
|
||||
func equalsRest(a, b language.Tag) bool {
|
||||
// TODO: don't include extensions in this comparison. To do this efficiently,
|
||||
// though, we should handle private tags separately.
|
||||
return a.ScriptID == b.ScriptID && a.RegionID == b.RegionID && a.VariantOrPrivateUseTags() == b.VariantOrPrivateUseTags()
|
||||
}
|
||||
|
||||
// isExactEquivalent returns true if canonicalizing the language will not alter
|
||||
// the script or region of a tag.
|
||||
func isExactEquivalent(l language.Language) bool {
|
||||
for _, o := range notEquivalent {
|
||||
if o == l {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
var notEquivalent []language.Language
|
||||
|
||||
func init() {
|
||||
// Create a list of all languages for which canonicalization may alter the
|
||||
// script or region.
|
||||
for _, lm := range language.AliasMap {
|
||||
tag := language.Tag{LangID: language.Language(lm.From)}
|
||||
if tag, _ = canonicalize(All, tag); tag.ScriptID != 0 || tag.RegionID != 0 {
|
||||
notEquivalent = append(notEquivalent, language.Language(lm.From))
|
||||
}
|
||||
}
|
||||
// Maximize undefined regions of paradigm locales.
|
||||
for i, v := range paradigmLocales {
|
||||
t := language.Tag{LangID: language.Language(v[0])}
|
||||
max, _ := t.Maximize()
|
||||
if v[1] == 0 {
|
||||
paradigmLocales[i][1] = uint16(max.RegionID)
|
||||
}
|
||||
if v[2] == 0 {
|
||||
paradigmLocales[i][2] = uint16(max.RegionID)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,256 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/language"
|
||||
)
|
||||
|
||||
// ValueError is returned by any of the parsing functions when the
|
||||
// input is well-formed but the respective subtag is not recognized
|
||||
// as a valid value.
|
||||
type ValueError interface {
|
||||
error
|
||||
|
||||
// Subtag returns the subtag for which the error occurred.
|
||||
Subtag() string
|
||||
}
|
||||
|
||||
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
|
||||
// failed it returns an error and any part of the tag that could be parsed.
|
||||
// If parsing succeeded but an unknown value was found, it returns
|
||||
// ValueError. The Tag returned in this case is just stripped of the unknown
|
||||
// value. All other values are preserved. It accepts tags in the BCP 47 format
|
||||
// and extensions to this standard defined in
|
||||
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// The resulting tag is canonicalized using the default canonicalization type.
|
||||
func Parse(s string) (t Tag, err error) {
|
||||
return Default.Parse(s)
|
||||
}
|
||||
|
||||
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
|
||||
// failed it returns an error and any part of the tag that could be parsed.
|
||||
// If parsing succeeded but an unknown value was found, it returns
|
||||
// ValueError. The Tag returned in this case is just stripped of the unknown
|
||||
// value. All other values are preserved. It accepts tags in the BCP 47 format
|
||||
// and extensions to this standard defined in
|
||||
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// The resulting tag is canonicalized using the canonicalization type c.
|
||||
func (c CanonType) Parse(s string) (t Tag, err error) {
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
t = Tag{}
|
||||
err = language.ErrSyntax
|
||||
}
|
||||
}()
|
||||
|
||||
tt, err := language.Parse(s)
|
||||
if err != nil {
|
||||
return makeTag(tt), err
|
||||
}
|
||||
tt, changed := canonicalize(c, tt)
|
||||
if changed {
|
||||
tt.RemakeString()
|
||||
}
|
||||
return makeTag(tt), err
|
||||
}
|
||||
|
||||
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
|
||||
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
|
||||
// Base, Script or Region or slice of type Variant or Extension is passed more
|
||||
// than once, the latter will overwrite the former. Variants and Extensions are
|
||||
// accumulated, but if two extensions of the same type are passed, the latter
|
||||
// will replace the former. For -u extensions, though, the key-type pairs are
|
||||
// added, where later values overwrite older ones. A Tag overwrites all former
|
||||
// values and typically only makes sense as the first argument. The resulting
|
||||
// tag is returned after canonicalizing using the Default CanonType. If one or
|
||||
// more errors are encountered, one of the errors is returned.
|
||||
func Compose(part ...interface{}) (t Tag, err error) {
|
||||
return Default.Compose(part...)
|
||||
}
|
||||
|
||||
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
|
||||
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
|
||||
// Base, Script or Region or slice of type Variant or Extension is passed more
|
||||
// than once, the latter will overwrite the former. Variants and Extensions are
|
||||
// accumulated, but if two extensions of the same type are passed, the latter
|
||||
// will replace the former. For -u extensions, though, the key-type pairs are
|
||||
// added, where later values overwrite older ones. A Tag overwrites all former
|
||||
// values and typically only makes sense as the first argument. The resulting
|
||||
// tag is returned after canonicalizing using CanonType c. If one or more errors
|
||||
// are encountered, one of the errors is returned.
|
||||
func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
t = Tag{}
|
||||
err = language.ErrSyntax
|
||||
}
|
||||
}()
|
||||
|
||||
var b language.Builder
|
||||
if err = update(&b, part...); err != nil {
|
||||
return und, err
|
||||
}
|
||||
b.Tag, _ = canonicalize(c, b.Tag)
|
||||
return makeTag(b.Make()), err
|
||||
}
|
||||
|
||||
var errInvalidArgument = errors.New("invalid Extension or Variant")
|
||||
|
||||
func update(b *language.Builder, part ...interface{}) (err error) {
|
||||
for _, x := range part {
|
||||
switch v := x.(type) {
|
||||
case Tag:
|
||||
b.SetTag(v.tag())
|
||||
case Base:
|
||||
b.Tag.LangID = v.langID
|
||||
case Script:
|
||||
b.Tag.ScriptID = v.scriptID
|
||||
case Region:
|
||||
b.Tag.RegionID = v.regionID
|
||||
case Variant:
|
||||
if v.variant == "" {
|
||||
err = errInvalidArgument
|
||||
break
|
||||
}
|
||||
b.AddVariant(v.variant)
|
||||
case Extension:
|
||||
if v.s == "" {
|
||||
err = errInvalidArgument
|
||||
break
|
||||
}
|
||||
b.SetExt(v.s)
|
||||
case []Variant:
|
||||
b.ClearVariants()
|
||||
for _, v := range v {
|
||||
b.AddVariant(v.variant)
|
||||
}
|
||||
case []Extension:
|
||||
b.ClearExtensions()
|
||||
for _, e := range v {
|
||||
b.SetExt(e.s)
|
||||
}
|
||||
// TODO: support parsing of raw strings based on morphology or just extensions?
|
||||
case error:
|
||||
if v != nil {
|
||||
err = v
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
|
||||
var errTagListTooLarge = errors.New("tag list exceeds max length")
|
||||
|
||||
// ParseAcceptLanguage parses the contents of an Accept-Language header as
|
||||
// defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
|
||||
// a list of corresponding quality weights. It is more permissive than RFC 2616
|
||||
// and may return non-nil slices even if the input is not valid.
|
||||
// The Tags will be sorted by highest weight first and then by first occurrence.
|
||||
// Tags with a weight of zero will be dropped. An error will be returned if the
|
||||
// input could not be parsed.
|
||||
func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
tag = nil
|
||||
q = nil
|
||||
err = language.ErrSyntax
|
||||
}
|
||||
}()
|
||||
|
||||
if strings.Count(s, "-") > 1000 {
|
||||
return nil, nil, errTagListTooLarge
|
||||
}
|
||||
|
||||
var entry string
|
||||
for s != "" {
|
||||
if entry, s = split(s, ','); entry == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
entry, weight := split(entry, ';')
|
||||
|
||||
// Scan the language.
|
||||
t, err := Parse(entry)
|
||||
if err != nil {
|
||||
id, ok := acceptFallback[entry]
|
||||
if !ok {
|
||||
return nil, nil, err
|
||||
}
|
||||
t = makeTag(language.Tag{LangID: id})
|
||||
}
|
||||
|
||||
// Scan the optional weight.
|
||||
w := 1.0
|
||||
if weight != "" {
|
||||
weight = consume(weight, 'q')
|
||||
weight = consume(weight, '=')
|
||||
// consume returns the empty string when a token could not be
|
||||
// consumed, resulting in an error for ParseFloat.
|
||||
if w, err = strconv.ParseFloat(weight, 32); err != nil {
|
||||
return nil, nil, errInvalidWeight
|
||||
}
|
||||
// Drop tags with a quality weight of 0.
|
||||
if w <= 0 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
tag = append(tag, t)
|
||||
q = append(q, float32(w))
|
||||
}
|
||||
sort.Stable(&tagSort{tag, q})
|
||||
return tag, q, nil
|
||||
}
|
||||
|
||||
// consume removes a leading token c from s and returns the result or the empty
|
||||
// string if there is no such token.
|
||||
func consume(s string, c byte) string {
|
||||
if s == "" || s[0] != c {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(s[1:])
|
||||
}
|
||||
|
||||
func split(s string, c byte) (head, tail string) {
|
||||
if i := strings.IndexByte(s, c); i >= 0 {
|
||||
return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
|
||||
}
|
||||
return strings.TrimSpace(s), ""
|
||||
}
|
||||
|
||||
// Add hack mapping to deal with a small number of cases that occur
|
||||
// in Accept-Language (with reasonable frequency).
|
||||
var acceptFallback = map[string]language.Language{
|
||||
"english": _en,
|
||||
"deutsch": _de,
|
||||
"italian": _it,
|
||||
"french": _fr,
|
||||
"*": _mul, // defined in the spec to match all languages.
|
||||
}
|
||||
|
||||
type tagSort struct {
|
||||
tag []Tag
|
||||
q []float32
|
||||
}
|
||||
|
||||
func (s *tagSort) Len() int {
|
||||
return len(s.q)
|
||||
}
|
||||
|
||||
func (s *tagSort) Less(i, j int) bool {
|
||||
return s.q[i] > s.q[j]
|
||||
}
|
||||
|
||||
func (s *tagSort) Swap(i, j int) {
|
||||
s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
|
||||
s.q[i], s.q[j] = s.q[j], s.q[i]
|
||||
}
|
|
@ -0,0 +1,298 @@
|
|||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||
|
||||
package language
|
||||
|
||||
// CLDRVersion is the CLDR version from which the tables in this package are derived.
|
||||
const CLDRVersion = "32"
|
||||
|
||||
const (
|
||||
_de = 269
|
||||
_en = 313
|
||||
_fr = 350
|
||||
_it = 505
|
||||
_mo = 784
|
||||
_no = 879
|
||||
_nb = 839
|
||||
_pt = 960
|
||||
_sh = 1031
|
||||
_mul = 806
|
||||
_und = 0
|
||||
)
|
||||
const (
|
||||
_001 = 1
|
||||
_419 = 31
|
||||
_BR = 65
|
||||
_CA = 73
|
||||
_ES = 111
|
||||
_GB = 124
|
||||
_MD = 189
|
||||
_PT = 239
|
||||
_UK = 307
|
||||
_US = 310
|
||||
_ZZ = 358
|
||||
_XA = 324
|
||||
_XC = 326
|
||||
_XK = 334
|
||||
)
|
||||
const (
|
||||
_Latn = 91
|
||||
_Hani = 57
|
||||
_Hans = 59
|
||||
_Hant = 60
|
||||
_Qaaa = 149
|
||||
_Qaai = 157
|
||||
_Qabx = 198
|
||||
_Zinh = 255
|
||||
_Zyyy = 260
|
||||
_Zzzz = 261
|
||||
)
|
||||
|
||||
var regionToGroups = []uint8{ // 359 elements
|
||||
// Entry 0 - 3F
|
||||
0x00, 0x00, 0x00, 0x04, 0x04, 0x00, 0x00, 0x04,
|
||||
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x00,
|
||||
0x00, 0x04, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00,
|
||||
0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x00, 0x04,
|
||||
// Entry 40 - 7F
|
||||
0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x04,
|
||||
0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x00,
|
||||
0x08, 0x00, 0x04, 0x00, 0x00, 0x08, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04,
|
||||
// Entry 80 - BF
|
||||
0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x04, 0x00,
|
||||
0x00, 0x00, 0x04, 0x01, 0x00, 0x04, 0x02, 0x00,
|
||||
0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00,
|
||||
0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x08, 0x08, 0x00, 0x00, 0x00, 0x04,
|
||||
// Entry C0 - FF
|
||||
0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
|
||||
0x01, 0x04, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00,
|
||||
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x04, 0x00, 0x05, 0x00, 0x00,
|
||||
0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// Entry 100 - 13F
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,
|
||||
0x00, 0x00, 0x00, 0x04, 0x04, 0x00, 0x00, 0x00,
|
||||
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05, 0x04,
|
||||
0x00, 0x00, 0x04, 0x00, 0x04, 0x04, 0x05, 0x00,
|
||||
// Entry 140 - 17F
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
} // Size: 383 bytes
|
||||
|
||||
var paradigmLocales = [][3]uint16{ // 3 elements
|
||||
0: [3]uint16{0x139, 0x0, 0x7c},
|
||||
1: [3]uint16{0x13e, 0x0, 0x1f},
|
||||
2: [3]uint16{0x3c0, 0x41, 0xef},
|
||||
} // Size: 42 bytes
|
||||
|
||||
type mutualIntelligibility struct {
|
||||
want uint16
|
||||
have uint16
|
||||
distance uint8
|
||||
oneway bool
|
||||
}
|
||||
type scriptIntelligibility struct {
|
||||
wantLang uint16
|
||||
haveLang uint16
|
||||
wantScript uint8
|
||||
haveScript uint8
|
||||
distance uint8
|
||||
}
|
||||
type regionIntelligibility struct {
|
||||
lang uint16
|
||||
script uint8
|
||||
group uint8
|
||||
distance uint8
|
||||
}
|
||||
|
||||
// matchLang holds pairs of langIDs of base languages that are typically
|
||||
// mutually intelligible. Each pair is associated with a confidence and
|
||||
// whether the intelligibility goes one or both ways.
|
||||
var matchLang = []mutualIntelligibility{ // 113 elements
|
||||
0: {want: 0x1d1, have: 0xb7, distance: 0x4, oneway: false},
|
||||
1: {want: 0x407, have: 0xb7, distance: 0x4, oneway: false},
|
||||
2: {want: 0x407, have: 0x1d1, distance: 0x4, oneway: false},
|
||||
3: {want: 0x407, have: 0x432, distance: 0x4, oneway: false},
|
||||
4: {want: 0x43a, have: 0x1, distance: 0x4, oneway: false},
|
||||
5: {want: 0x1a3, have: 0x10d, distance: 0x4, oneway: true},
|
||||
6: {want: 0x295, have: 0x10d, distance: 0x4, oneway: true},
|
||||
7: {want: 0x101, have: 0x36f, distance: 0x8, oneway: false},
|
||||
8: {want: 0x101, have: 0x347, distance: 0x8, oneway: false},
|
||||
9: {want: 0x5, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
10: {want: 0xd, have: 0x139, distance: 0xa, oneway: true},
|
||||
11: {want: 0x16, have: 0x367, distance: 0xa, oneway: true},
|
||||
12: {want: 0x21, have: 0x139, distance: 0xa, oneway: true},
|
||||
13: {want: 0x56, have: 0x13e, distance: 0xa, oneway: true},
|
||||
14: {want: 0x58, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
15: {want: 0x71, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
16: {want: 0x75, have: 0x139, distance: 0xa, oneway: true},
|
||||
17: {want: 0x82, have: 0x1be, distance: 0xa, oneway: true},
|
||||
18: {want: 0xa5, have: 0x139, distance: 0xa, oneway: true},
|
||||
19: {want: 0xb2, have: 0x15e, distance: 0xa, oneway: true},
|
||||
20: {want: 0xdd, have: 0x153, distance: 0xa, oneway: true},
|
||||
21: {want: 0xe5, have: 0x139, distance: 0xa, oneway: true},
|
||||
22: {want: 0xe9, have: 0x3a, distance: 0xa, oneway: true},
|
||||
23: {want: 0xf0, have: 0x15e, distance: 0xa, oneway: true},
|
||||
24: {want: 0xf9, have: 0x15e, distance: 0xa, oneway: true},
|
||||
25: {want: 0x100, have: 0x139, distance: 0xa, oneway: true},
|
||||
26: {want: 0x130, have: 0x139, distance: 0xa, oneway: true},
|
||||
27: {want: 0x13c, have: 0x139, distance: 0xa, oneway: true},
|
||||
28: {want: 0x140, have: 0x151, distance: 0xa, oneway: true},
|
||||
29: {want: 0x145, have: 0x13e, distance: 0xa, oneway: true},
|
||||
30: {want: 0x158, have: 0x101, distance: 0xa, oneway: true},
|
||||
31: {want: 0x16d, have: 0x367, distance: 0xa, oneway: true},
|
||||
32: {want: 0x16e, have: 0x139, distance: 0xa, oneway: true},
|
||||
33: {want: 0x16f, have: 0x139, distance: 0xa, oneway: true},
|
||||
34: {want: 0x17e, have: 0x139, distance: 0xa, oneway: true},
|
||||
35: {want: 0x190, have: 0x13e, distance: 0xa, oneway: true},
|
||||
36: {want: 0x194, have: 0x13e, distance: 0xa, oneway: true},
|
||||
37: {want: 0x1a4, have: 0x1be, distance: 0xa, oneway: true},
|
||||
38: {want: 0x1b4, have: 0x139, distance: 0xa, oneway: true},
|
||||
39: {want: 0x1b8, have: 0x139, distance: 0xa, oneway: true},
|
||||
40: {want: 0x1d4, have: 0x15e, distance: 0xa, oneway: true},
|
||||
41: {want: 0x1d7, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
42: {want: 0x1d9, have: 0x139, distance: 0xa, oneway: true},
|
||||
43: {want: 0x1e7, have: 0x139, distance: 0xa, oneway: true},
|
||||
44: {want: 0x1f8, have: 0x139, distance: 0xa, oneway: true},
|
||||
45: {want: 0x20e, have: 0x1e1, distance: 0xa, oneway: true},
|
||||
46: {want: 0x210, have: 0x139, distance: 0xa, oneway: true},
|
||||
47: {want: 0x22d, have: 0x15e, distance: 0xa, oneway: true},
|
||||
48: {want: 0x242, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
49: {want: 0x24a, have: 0x139, distance: 0xa, oneway: true},
|
||||
50: {want: 0x251, have: 0x139, distance: 0xa, oneway: true},
|
||||
51: {want: 0x265, have: 0x139, distance: 0xa, oneway: true},
|
||||
52: {want: 0x274, have: 0x48a, distance: 0xa, oneway: true},
|
||||
53: {want: 0x28a, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
54: {want: 0x28e, have: 0x1f9, distance: 0xa, oneway: true},
|
||||
55: {want: 0x2a3, have: 0x139, distance: 0xa, oneway: true},
|
||||
56: {want: 0x2b5, have: 0x15e, distance: 0xa, oneway: true},
|
||||
57: {want: 0x2b8, have: 0x139, distance: 0xa, oneway: true},
|
||||
58: {want: 0x2be, have: 0x139, distance: 0xa, oneway: true},
|
||||
59: {want: 0x2c3, have: 0x15e, distance: 0xa, oneway: true},
|
||||
60: {want: 0x2ed, have: 0x139, distance: 0xa, oneway: true},
|
||||
61: {want: 0x2f1, have: 0x15e, distance: 0xa, oneway: true},
|
||||
62: {want: 0x2fa, have: 0x139, distance: 0xa, oneway: true},
|
||||
63: {want: 0x2ff, have: 0x7e, distance: 0xa, oneway: true},
|
||||
64: {want: 0x304, have: 0x139, distance: 0xa, oneway: true},
|
||||
65: {want: 0x30b, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
66: {want: 0x31b, have: 0x1be, distance: 0xa, oneway: true},
|
||||
67: {want: 0x31f, have: 0x1e1, distance: 0xa, oneway: true},
|
||||
68: {want: 0x320, have: 0x139, distance: 0xa, oneway: true},
|
||||
69: {want: 0x331, have: 0x139, distance: 0xa, oneway: true},
|
||||
70: {want: 0x351, have: 0x139, distance: 0xa, oneway: true},
|
||||
71: {want: 0x36a, have: 0x347, distance: 0xa, oneway: false},
|
||||
72: {want: 0x36a, have: 0x36f, distance: 0xa, oneway: true},
|
||||
73: {want: 0x37a, have: 0x139, distance: 0xa, oneway: true},
|
||||
74: {want: 0x387, have: 0x139, distance: 0xa, oneway: true},
|
||||
75: {want: 0x389, have: 0x139, distance: 0xa, oneway: true},
|
||||
76: {want: 0x38b, have: 0x15e, distance: 0xa, oneway: true},
|
||||
77: {want: 0x390, have: 0x139, distance: 0xa, oneway: true},
|
||||
78: {want: 0x395, have: 0x139, distance: 0xa, oneway: true},
|
||||
79: {want: 0x39d, have: 0x139, distance: 0xa, oneway: true},
|
||||
80: {want: 0x3a5, have: 0x139, distance: 0xa, oneway: true},
|
||||
81: {want: 0x3be, have: 0x139, distance: 0xa, oneway: true},
|
||||
82: {want: 0x3c4, have: 0x13e, distance: 0xa, oneway: true},
|
||||
83: {want: 0x3d4, have: 0x10d, distance: 0xa, oneway: true},
|
||||
84: {want: 0x3d9, have: 0x139, distance: 0xa, oneway: true},
|
||||
85: {want: 0x3e5, have: 0x15e, distance: 0xa, oneway: true},
|
||||
86: {want: 0x3e9, have: 0x1be, distance: 0xa, oneway: true},
|
||||
87: {want: 0x3fa, have: 0x139, distance: 0xa, oneway: true},
|
||||
88: {want: 0x40c, have: 0x139, distance: 0xa, oneway: true},
|
||||
89: {want: 0x423, have: 0x139, distance: 0xa, oneway: true},
|
||||
90: {want: 0x429, have: 0x139, distance: 0xa, oneway: true},
|
||||
91: {want: 0x431, have: 0x139, distance: 0xa, oneway: true},
|
||||
92: {want: 0x43b, have: 0x139, distance: 0xa, oneway: true},
|
||||
93: {want: 0x43e, have: 0x1e1, distance: 0xa, oneway: true},
|
||||
94: {want: 0x445, have: 0x139, distance: 0xa, oneway: true},
|
||||
95: {want: 0x450, have: 0x139, distance: 0xa, oneway: true},
|
||||
96: {want: 0x461, have: 0x139, distance: 0xa, oneway: true},
|
||||
97: {want: 0x467, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
98: {want: 0x46f, have: 0x139, distance: 0xa, oneway: true},
|
||||
99: {want: 0x476, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
100: {want: 0x3883, have: 0x139, distance: 0xa, oneway: true},
|
||||
101: {want: 0x480, have: 0x139, distance: 0xa, oneway: true},
|
||||
102: {want: 0x482, have: 0x139, distance: 0xa, oneway: true},
|
||||
103: {want: 0x494, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
104: {want: 0x49d, have: 0x139, distance: 0xa, oneway: true},
|
||||
105: {want: 0x4ac, have: 0x529, distance: 0xa, oneway: true},
|
||||
106: {want: 0x4b4, have: 0x139, distance: 0xa, oneway: true},
|
||||
107: {want: 0x4bc, have: 0x3e2, distance: 0xa, oneway: true},
|
||||
108: {want: 0x4e5, have: 0x15e, distance: 0xa, oneway: true},
|
||||
109: {want: 0x4f2, have: 0x139, distance: 0xa, oneway: true},
|
||||
110: {want: 0x512, have: 0x139, distance: 0xa, oneway: true},
|
||||
111: {want: 0x518, have: 0x139, distance: 0xa, oneway: true},
|
||||
112: {want: 0x52f, have: 0x139, distance: 0xa, oneway: true},
|
||||
} // Size: 702 bytes
|
||||
|
||||
// matchScript holds pairs of scriptIDs where readers of one script
|
||||
// can typically also read the other. Each is associated with a confidence.
|
||||
var matchScript = []scriptIntelligibility{ // 26 elements
|
||||
0: {wantLang: 0x432, haveLang: 0x432, wantScript: 0x5b, haveScript: 0x20, distance: 0x5},
|
||||
1: {wantLang: 0x432, haveLang: 0x432, wantScript: 0x20, haveScript: 0x5b, distance: 0x5},
|
||||
2: {wantLang: 0x58, haveLang: 0x3e2, wantScript: 0x5b, haveScript: 0x20, distance: 0xa},
|
||||
3: {wantLang: 0xa5, haveLang: 0x139, wantScript: 0xe, haveScript: 0x5b, distance: 0xa},
|
||||
4: {wantLang: 0x1d7, haveLang: 0x3e2, wantScript: 0x8, haveScript: 0x20, distance: 0xa},
|
||||
5: {wantLang: 0x210, haveLang: 0x139, wantScript: 0x2e, haveScript: 0x5b, distance: 0xa},
|
||||
6: {wantLang: 0x24a, haveLang: 0x139, wantScript: 0x4f, haveScript: 0x5b, distance: 0xa},
|
||||
7: {wantLang: 0x251, haveLang: 0x139, wantScript: 0x53, haveScript: 0x5b, distance: 0xa},
|
||||
8: {wantLang: 0x2b8, haveLang: 0x139, wantScript: 0x58, haveScript: 0x5b, distance: 0xa},
|
||||
9: {wantLang: 0x304, haveLang: 0x139, wantScript: 0x6f, haveScript: 0x5b, distance: 0xa},
|
||||
10: {wantLang: 0x331, haveLang: 0x139, wantScript: 0x76, haveScript: 0x5b, distance: 0xa},
|
||||
11: {wantLang: 0x351, haveLang: 0x139, wantScript: 0x22, haveScript: 0x5b, distance: 0xa},
|
||||
12: {wantLang: 0x395, haveLang: 0x139, wantScript: 0x83, haveScript: 0x5b, distance: 0xa},
|
||||
13: {wantLang: 0x39d, haveLang: 0x139, wantScript: 0x36, haveScript: 0x5b, distance: 0xa},
|
||||
14: {wantLang: 0x3be, haveLang: 0x139, wantScript: 0x5, haveScript: 0x5b, distance: 0xa},
|
||||
15: {wantLang: 0x3fa, haveLang: 0x139, wantScript: 0x5, haveScript: 0x5b, distance: 0xa},
|
||||
16: {wantLang: 0x40c, haveLang: 0x139, wantScript: 0xd6, haveScript: 0x5b, distance: 0xa},
|
||||
17: {wantLang: 0x450, haveLang: 0x139, wantScript: 0xe6, haveScript: 0x5b, distance: 0xa},
|
||||
18: {wantLang: 0x461, haveLang: 0x139, wantScript: 0xe9, haveScript: 0x5b, distance: 0xa},
|
||||
19: {wantLang: 0x46f, haveLang: 0x139, wantScript: 0x2c, haveScript: 0x5b, distance: 0xa},
|
||||
20: {wantLang: 0x476, haveLang: 0x3e2, wantScript: 0x5b, haveScript: 0x20, distance: 0xa},
|
||||
21: {wantLang: 0x4b4, haveLang: 0x139, wantScript: 0x5, haveScript: 0x5b, distance: 0xa},
|
||||
22: {wantLang: 0x4bc, haveLang: 0x3e2, wantScript: 0x5b, haveScript: 0x20, distance: 0xa},
|
||||
23: {wantLang: 0x512, haveLang: 0x139, wantScript: 0x3e, haveScript: 0x5b, distance: 0xa},
|
||||
24: {wantLang: 0x529, haveLang: 0x529, wantScript: 0x3b, haveScript: 0x3c, distance: 0xf},
|
||||
25: {wantLang: 0x529, haveLang: 0x529, wantScript: 0x3c, haveScript: 0x3b, distance: 0x13},
|
||||
} // Size: 232 bytes
|
||||
|
||||
var matchRegion = []regionIntelligibility{ // 15 elements
|
||||
0: {lang: 0x3a, script: 0x0, group: 0x4, distance: 0x4},
|
||||
1: {lang: 0x3a, script: 0x0, group: 0x84, distance: 0x4},
|
||||
2: {lang: 0x139, script: 0x0, group: 0x1, distance: 0x4},
|
||||
3: {lang: 0x139, script: 0x0, group: 0x81, distance: 0x4},
|
||||
4: {lang: 0x13e, script: 0x0, group: 0x3, distance: 0x4},
|
||||
5: {lang: 0x13e, script: 0x0, group: 0x83, distance: 0x4},
|
||||
6: {lang: 0x3c0, script: 0x0, group: 0x3, distance: 0x4},
|
||||
7: {lang: 0x3c0, script: 0x0, group: 0x83, distance: 0x4},
|
||||
8: {lang: 0x529, script: 0x3c, group: 0x2, distance: 0x4},
|
||||
9: {lang: 0x529, script: 0x3c, group: 0x82, distance: 0x4},
|
||||
10: {lang: 0x3a, script: 0x0, group: 0x80, distance: 0x5},
|
||||
11: {lang: 0x139, script: 0x0, group: 0x80, distance: 0x5},
|
||||
12: {lang: 0x13e, script: 0x0, group: 0x80, distance: 0x5},
|
||||
13: {lang: 0x3c0, script: 0x0, group: 0x80, distance: 0x5},
|
||||
14: {lang: 0x529, script: 0x3c, group: 0x80, distance: 0x5},
|
||||
} // Size: 114 bytes
|
||||
|
||||
// Total table size 1473 bytes (1KiB); checksum: 7BB90B5C
|
|
@ -0,0 +1,145 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import "golang.org/x/text/internal/language/compact"
|
||||
|
||||
// TODO: Various sets of commonly use tags and regions.
|
||||
|
||||
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
|
||||
// It simplifies safe initialization of Tag values.
|
||||
func MustParse(s string) Tag {
|
||||
t, err := Parse(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
|
||||
// It simplifies safe initialization of Tag values.
|
||||
func (c CanonType) MustParse(s string) Tag {
|
||||
t, err := c.Parse(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// MustParseBase is like ParseBase, but panics if the given base cannot be parsed.
|
||||
// It simplifies safe initialization of Base values.
|
||||
func MustParseBase(s string) Base {
|
||||
b, err := ParseBase(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// MustParseScript is like ParseScript, but panics if the given script cannot be
|
||||
// parsed. It simplifies safe initialization of Script values.
|
||||
func MustParseScript(s string) Script {
|
||||
scr, err := ParseScript(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return scr
|
||||
}
|
||||
|
||||
// MustParseRegion is like ParseRegion, but panics if the given region cannot be
|
||||
// parsed. It simplifies safe initialization of Region values.
|
||||
func MustParseRegion(s string) Region {
|
||||
r, err := ParseRegion(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
var (
|
||||
und = Tag{}
|
||||
|
||||
Und Tag = Tag{}
|
||||
|
||||
Afrikaans Tag = Tag(compact.Afrikaans)
|
||||
Amharic Tag = Tag(compact.Amharic)
|
||||
Arabic Tag = Tag(compact.Arabic)
|
||||
ModernStandardArabic Tag = Tag(compact.ModernStandardArabic)
|
||||
Azerbaijani Tag = Tag(compact.Azerbaijani)
|
||||
Bulgarian Tag = Tag(compact.Bulgarian)
|
||||
Bengali Tag = Tag(compact.Bengali)
|
||||
Catalan Tag = Tag(compact.Catalan)
|
||||
Czech Tag = Tag(compact.Czech)
|
||||
Danish Tag = Tag(compact.Danish)
|
||||
German Tag = Tag(compact.German)
|
||||
Greek Tag = Tag(compact.Greek)
|
||||
English Tag = Tag(compact.English)
|
||||
AmericanEnglish Tag = Tag(compact.AmericanEnglish)
|
||||
BritishEnglish Tag = Tag(compact.BritishEnglish)
|
||||
Spanish Tag = Tag(compact.Spanish)
|
||||
EuropeanSpanish Tag = Tag(compact.EuropeanSpanish)
|
||||
LatinAmericanSpanish Tag = Tag(compact.LatinAmericanSpanish)
|
||||
Estonian Tag = Tag(compact.Estonian)
|
||||
Persian Tag = Tag(compact.Persian)
|
||||
Finnish Tag = Tag(compact.Finnish)
|
||||
Filipino Tag = Tag(compact.Filipino)
|
||||
French Tag = Tag(compact.French)
|
||||
CanadianFrench Tag = Tag(compact.CanadianFrench)
|
||||
Gujarati Tag = Tag(compact.Gujarati)
|
||||
Hebrew Tag = Tag(compact.Hebrew)
|
||||
Hindi Tag = Tag(compact.Hindi)
|
||||
Croatian Tag = Tag(compact.Croatian)
|
||||
Hungarian Tag = Tag(compact.Hungarian)
|
||||
Armenian Tag = Tag(compact.Armenian)
|
||||
Indonesian Tag = Tag(compact.Indonesian)
|
||||
Icelandic Tag = Tag(compact.Icelandic)
|
||||
Italian Tag = Tag(compact.Italian)
|
||||
Japanese Tag = Tag(compact.Japanese)
|
||||
Georgian Tag = Tag(compact.Georgian)
|
||||
Kazakh Tag = Tag(compact.Kazakh)
|
||||
Khmer Tag = Tag(compact.Khmer)
|
||||
Kannada Tag = Tag(compact.Kannada)
|
||||
Korean Tag = Tag(compact.Korean)
|
||||
Kirghiz Tag = Tag(compact.Kirghiz)
|
||||
Lao Tag = Tag(compact.Lao)
|
||||
Lithuanian Tag = Tag(compact.Lithuanian)
|
||||
Latvian Tag = Tag(compact.Latvian)
|
||||
Macedonian Tag = Tag(compact.Macedonian)
|
||||
Malayalam Tag = Tag(compact.Malayalam)
|
||||
Mongolian Tag = Tag(compact.Mongolian)
|
||||
Marathi Tag = Tag(compact.Marathi)
|
||||
Malay Tag = Tag(compact.Malay)
|
||||
Burmese Tag = Tag(compact.Burmese)
|
||||
Nepali Tag = Tag(compact.Nepali)
|
||||
Dutch Tag = Tag(compact.Dutch)
|
||||
Norwegian Tag = Tag(compact.Norwegian)
|
||||
Punjabi Tag = Tag(compact.Punjabi)
|
||||
Polish Tag = Tag(compact.Polish)
|
||||
Portuguese Tag = Tag(compact.Portuguese)
|
||||
BrazilianPortuguese Tag = Tag(compact.BrazilianPortuguese)
|
||||
EuropeanPortuguese Tag = Tag(compact.EuropeanPortuguese)
|
||||
Romanian Tag = Tag(compact.Romanian)
|
||||
Russian Tag = Tag(compact.Russian)
|
||||
Sinhala Tag = Tag(compact.Sinhala)
|
||||
Slovak Tag = Tag(compact.Slovak)
|
||||
Slovenian Tag = Tag(compact.Slovenian)
|
||||
Albanian Tag = Tag(compact.Albanian)
|
||||
Serbian Tag = Tag(compact.Serbian)
|
||||
SerbianLatin Tag = Tag(compact.SerbianLatin)
|
||||
Swedish Tag = Tag(compact.Swedish)
|
||||
Swahili Tag = Tag(compact.Swahili)
|
||||
Tamil Tag = Tag(compact.Tamil)
|
||||
Telugu Tag = Tag(compact.Telugu)
|
||||
Thai Tag = Tag(compact.Thai)
|
||||
Turkish Tag = Tag(compact.Turkish)
|
||||
Ukrainian Tag = Tag(compact.Ukrainian)
|
||||
Urdu Tag = Tag(compact.Urdu)
|
||||
Uzbek Tag = Tag(compact.Uzbek)
|
||||
Vietnamese Tag = Tag(compact.Vietnamese)
|
||||
Chinese Tag = Tag(compact.Chinese)
|
||||
SimplifiedChinese Tag = Tag(compact.SimplifiedChinese)
|
||||
TraditionalChinese Tag = Tag(compact.TraditionalChinese)
|
||||
Zulu Tag = Tag(compact.Zulu)
|
||||
)
|
|
@ -903,6 +903,12 @@ golang.org/x/sys/windows/registry
|
|||
golang.org/x/term
|
||||
# golang.org/x/text v0.14.0
|
||||
## explicit; go 1.18
|
||||
golang.org/x/text/cases
|
||||
golang.org/x/text/internal
|
||||
golang.org/x/text/internal/language
|
||||
golang.org/x/text/internal/language/compact
|
||||
golang.org/x/text/internal/tag
|
||||
golang.org/x/text/language
|
||||
golang.org/x/text/secure/bidirule
|
||||
golang.org/x/text/transform
|
||||
golang.org/x/text/unicode/bidi
|
||||
|
|
Loading…
Reference in New Issue