mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2025-01-01 04:13:59 -05:00
72 lines
1.3 KiB
Go
72 lines
1.3 KiB
Go
|
package chardet
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
)
|
||
|
|
||
|
// utf8Bom holds the three bytes of the UTF-8 byte order mark; Match checks
// whether the raw input begins with it.
var utf8Bom = []byte("\xEF\xBB\xBF")
|
||
|
|
||
|
// recognizerUtf8 is the UTF-8 charset recognizer. It carries no fields; all
// of its work happens in its Match method.
type recognizerUtf8 struct{}
|
||
|
|
||
|
func newRecognizer_utf8() *recognizerUtf8 {
|
||
|
return &recognizerUtf8{}
|
||
|
}
|
||
|
|
||
|
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
|
||
|
output = recognizerOutput{
|
||
|
Charset: "UTF-8",
|
||
|
}
|
||
|
hasBom := bytes.HasPrefix(input.raw, utf8Bom)
|
||
|
inputLen := len(input.raw)
|
||
|
var numValid, numInvalid uint32
|
||
|
var trailBytes uint8
|
||
|
for i := 0; i < inputLen; i++ {
|
||
|
c := input.raw[i]
|
||
|
if c&0x80 == 0 {
|
||
|
continue
|
||
|
}
|
||
|
if c&0xE0 == 0xC0 {
|
||
|
trailBytes = 1
|
||
|
} else if c&0xF0 == 0xE0 {
|
||
|
trailBytes = 2
|
||
|
} else if c&0xF8 == 0xF0 {
|
||
|
trailBytes = 3
|
||
|
} else {
|
||
|
numInvalid++
|
||
|
if numInvalid > 5 {
|
||
|
break
|
||
|
}
|
||
|
trailBytes = 0
|
||
|
}
|
||
|
|
||
|
for i++; i < inputLen; i++ {
|
||
|
c = input.raw[i]
|
||
|
if c&0xC0 != 0x80 {
|
||
|
numInvalid++
|
||
|
break
|
||
|
}
|
||
|
if trailBytes--; trailBytes == 0 {
|
||
|
numValid++
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if hasBom && numInvalid == 0 {
|
||
|
output.Confidence = 100
|
||
|
} else if hasBom && numValid > numInvalid*10 {
|
||
|
output.Confidence = 80
|
||
|
} else if numValid > 3 && numInvalid == 0 {
|
||
|
output.Confidence = 100
|
||
|
} else if numValid > 0 && numInvalid == 0 {
|
||
|
output.Confidence = 80
|
||
|
} else if numValid == 0 && numInvalid == 0 {
|
||
|
// Plain ASCII
|
||
|
output.Confidence = 10
|
||
|
} else if numValid > numInvalid*10 {
|
||
|
output.Confidence = 25
|
||
|
}
|
||
|
return
|
||
|
}
|