mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2025-01-03 21:20:45 -05:00
27757714d0
* Move to goldmark Markdown rendering moved from blackfriday to the goldmark. Multiple subtle changes required to the goldmark extensions to keep current rendering and defaults. Can go further with goldmark linkify and have this work within markdown rendering making the link processor unnecessary. Need to think about how to go about allowing extensions - at present it seems that these would be hard to do without recompilation. * linter fixes Co-authored-by: Lauris BH <lauris@nix.lv>
897 lines
25 KiB
Go
897 lines
25 KiB
Go
// Package util provides utility functions for the goldmark.
|
|
package util
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
"net/url"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// A CopyOnWriteBuffer is a byte buffer that copies buffer when
|
|
// it need to be changed.
|
|
type CopyOnWriteBuffer struct {
|
|
buffer []byte
|
|
copied bool
|
|
}
|
|
|
|
// NewCopyOnWriteBuffer returns a new CopyOnWriteBuffer.
|
|
func NewCopyOnWriteBuffer(buffer []byte) CopyOnWriteBuffer {
|
|
return CopyOnWriteBuffer{
|
|
buffer: buffer,
|
|
copied: false,
|
|
}
|
|
}
|
|
|
|
// Write writes given bytes to the buffer.
|
|
func (b *CopyOnWriteBuffer) Write(value []byte) {
|
|
if !b.copied {
|
|
b.buffer = make([]byte, 0, len(b.buffer)+20)
|
|
b.copied = true
|
|
}
|
|
b.buffer = append(b.buffer, value...)
|
|
}
|
|
|
|
// WriteByte writes the given byte to the buffer.
|
|
func (b *CopyOnWriteBuffer) WriteByte(c byte) {
|
|
if !b.copied {
|
|
b.buffer = make([]byte, 0, len(b.buffer)+20)
|
|
b.copied = true
|
|
}
|
|
b.buffer = append(b.buffer, c)
|
|
}
|
|
|
|
// Bytes returns bytes of this buffer.
|
|
func (b *CopyOnWriteBuffer) Bytes() []byte {
|
|
return b.buffer
|
|
}
|
|
|
|
// IsCopied returns true if buffer has been copied, otherwise false.
|
|
func (b *CopyOnWriteBuffer) IsCopied() bool {
|
|
return b.copied
|
|
}
|
|
|
|
// IsEscapedPunctuation returns true if caracter at a given index i
|
|
// is an escaped punctuation, otherwise false.
|
|
func IsEscapedPunctuation(source []byte, i int) bool {
|
|
return source[i] == '\\' && i < len(source)-1 && IsPunct(source[i+1])
|
|
}
|
|
|
|
// ReadWhile read the given source while pred is true.
|
|
func ReadWhile(source []byte, index [2]int, pred func(byte) bool) (int, bool) {
|
|
j := index[0]
|
|
ok := false
|
|
for ; j < index[1]; j++ {
|
|
c1 := source[j]
|
|
if pred(c1) {
|
|
ok = true
|
|
continue
|
|
}
|
|
break
|
|
}
|
|
return j, ok
|
|
}
|
|
|
|
// IsBlank returns true if the given string is all space characters.
|
|
func IsBlank(bs []byte) bool {
|
|
for _, b := range bs {
|
|
if !IsSpace(b) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// VisualizeSpaces visualize invisible space characters.
|
|
func VisualizeSpaces(bs []byte) []byte {
|
|
bs = bytes.Replace(bs, []byte(" "), []byte("[SPACE]"), -1)
|
|
bs = bytes.Replace(bs, []byte("\t"), []byte("[TAB]"), -1)
|
|
bs = bytes.Replace(bs, []byte("\n"), []byte("[NEWLINE]\n"), -1)
|
|
bs = bytes.Replace(bs, []byte("\r"), []byte("[CR]"), -1)
|
|
return bs
|
|
}
|
|
|
|
// TabWidth calculates actual width of a tab at the given position.
|
|
func TabWidth(currentPos int) int {
|
|
return 4 - currentPos%4
|
|
}
|
|
|
|
// IndentPosition searches an indent position with the given width for the given line.
|
|
// If the line contains tab characters, paddings may be not zero.
|
|
// currentPos==0 and width==2:
|
|
//
|
|
// position: 0 1
|
|
// [TAB]aaaa
|
|
// width: 1234 5678
|
|
//
|
|
// width=2 is in the tab character. In this case, IndentPosition returns
|
|
// (pos=1, padding=2)
|
|
func IndentPosition(bs []byte, currentPos, width int) (pos, padding int) {
|
|
if width == 0 {
|
|
return 0, 0
|
|
}
|
|
w := 0
|
|
l := len(bs)
|
|
i := 0
|
|
hasTab := false
|
|
for ; i < l; i++ {
|
|
if bs[i] == '\t' {
|
|
w += TabWidth(currentPos + w)
|
|
hasTab = true
|
|
} else if bs[i] == ' ' {
|
|
w++
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
if w >= width {
|
|
if !hasTab {
|
|
return width, 0
|
|
}
|
|
return i, w - width
|
|
}
|
|
return -1, -1
|
|
}
|
|
|
|
// IndentPositionPadding searches an indent position with the given width for the given line.
|
|
// This function is mostly same as IndentPosition except this function
|
|
// takes account into additional paddings.
|
|
func IndentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
|
|
if width == 0 {
|
|
return 0, paddingv
|
|
}
|
|
w := 0
|
|
i := 0
|
|
l := len(bs)
|
|
for ; i < l; i++ {
|
|
if bs[i] == '\t' {
|
|
w += TabWidth(currentPos + w)
|
|
} else if bs[i] == ' ' {
|
|
w++
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
if w >= width {
|
|
return i - paddingv, w - width
|
|
}
|
|
return -1, -1
|
|
}
|
|
|
|
// DedentPosition dedents lines by the given width.
|
|
func DedentPosition(bs []byte, currentPos, width int) (pos, padding int) {
|
|
if width == 0 {
|
|
return 0, 0
|
|
}
|
|
w := 0
|
|
l := len(bs)
|
|
i := 0
|
|
for ; i < l; i++ {
|
|
if bs[i] == '\t' {
|
|
w += TabWidth(currentPos + w)
|
|
} else if bs[i] == ' ' {
|
|
w++
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
if w >= width {
|
|
return i, w - width
|
|
}
|
|
return i, 0
|
|
}
|
|
|
|
// DedentPositionPadding dedents lines by the given width.
|
|
// This function is mostly same as DedentPosition except this function
|
|
// takes account into additional paddings.
|
|
func DedentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
|
|
if width == 0 {
|
|
return 0, paddingv
|
|
}
|
|
|
|
w := 0
|
|
i := 0
|
|
l := len(bs)
|
|
for ; i < l; i++ {
|
|
if bs[i] == '\t' {
|
|
w += TabWidth(currentPos + w)
|
|
} else if bs[i] == ' ' {
|
|
w++
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
if w >= width {
|
|
return i - paddingv, w - width
|
|
}
|
|
return i - paddingv, 0
|
|
}
|
|
|
|
// IndentWidth calculate an indent width for the given line.
|
|
func IndentWidth(bs []byte, currentPos int) (width, pos int) {
|
|
l := len(bs)
|
|
for i := 0; i < l; i++ {
|
|
b := bs[i]
|
|
if b == ' ' {
|
|
width++
|
|
pos++
|
|
} else if b == '\t' {
|
|
width += TabWidth(currentPos + width)
|
|
pos++
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// FirstNonSpacePosition returns a potisoin line that is a first nonspace
|
|
// character.
|
|
func FirstNonSpacePosition(bs []byte) int {
|
|
i := 0
|
|
for ; i < len(bs); i++ {
|
|
c := bs[i]
|
|
if c == ' ' || c == '\t' {
|
|
continue
|
|
}
|
|
if c == '\n' {
|
|
return -1
|
|
}
|
|
return i
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// FindClosure returns a position that closes the given opener.
|
|
// If codeSpan is set true, it ignores characters in code spans.
|
|
// If allowNesting is set true, closures correspond to nested opener will be
|
|
// ignored.
|
|
func FindClosure(bs []byte, opener, closure byte, codeSpan, allowNesting bool) int {
|
|
i := 0
|
|
opened := 1
|
|
codeSpanOpener := 0
|
|
for i < len(bs) {
|
|
c := bs[i]
|
|
if codeSpan && codeSpanOpener != 0 && c == '`' {
|
|
codeSpanCloser := 0
|
|
for ; i < len(bs); i++ {
|
|
if bs[i] == '`' {
|
|
codeSpanCloser++
|
|
} else {
|
|
i--
|
|
break
|
|
}
|
|
}
|
|
if codeSpanCloser == codeSpanOpener {
|
|
codeSpanOpener = 0
|
|
}
|
|
} else if c == '\\' && i < len(bs)-1 && IsPunct(bs[i+1]) {
|
|
i += 2
|
|
continue
|
|
} else if codeSpan && codeSpanOpener == 0 && c == '`' {
|
|
for ; i < len(bs); i++ {
|
|
if bs[i] == '`' {
|
|
codeSpanOpener++
|
|
} else {
|
|
i--
|
|
break
|
|
}
|
|
}
|
|
} else if (codeSpan && codeSpanOpener == 0) || !codeSpan {
|
|
if c == closure {
|
|
opened--
|
|
if opened == 0 {
|
|
return i
|
|
}
|
|
} else if c == opener {
|
|
if !allowNesting {
|
|
return -1
|
|
}
|
|
opened++
|
|
}
|
|
}
|
|
i++
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// TrimLeft trims characters in the given s from head of the source.
|
|
// bytes.TrimLeft offers same functionalities, but bytes.TrimLeft
|
|
// allocates new buffer for the result.
|
|
func TrimLeft(source, b []byte) []byte {
|
|
i := 0
|
|
for ; i < len(source); i++ {
|
|
c := source[i]
|
|
found := false
|
|
for j := 0; j < len(b); j++ {
|
|
if c == b[j] {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
break
|
|
}
|
|
}
|
|
return source[i:]
|
|
}
|
|
|
|
// TrimRight trims characters in the given s from tail of the source.
|
|
func TrimRight(source, b []byte) []byte {
|
|
i := len(source) - 1
|
|
for ; i >= 0; i-- {
|
|
c := source[i]
|
|
found := false
|
|
for j := 0; j < len(b); j++ {
|
|
if c == b[j] {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
break
|
|
}
|
|
}
|
|
return source[:i+1]
|
|
}
|
|
|
|
// TrimLeftLength returns a length of leading specified characters.
|
|
func TrimLeftLength(source, s []byte) int {
|
|
return len(source) - len(TrimLeft(source, s))
|
|
}
|
|
|
|
// TrimRightLength returns a length of trailing specified characters.
|
|
func TrimRightLength(source, s []byte) int {
|
|
return len(source) - len(TrimRight(source, s))
|
|
}
|
|
|
|
// TrimLeftSpaceLength returns a length of leading space characters.
|
|
func TrimLeftSpaceLength(source []byte) int {
|
|
i := 0
|
|
for ; i < len(source); i++ {
|
|
if !IsSpace(source[i]) {
|
|
break
|
|
}
|
|
}
|
|
return i
|
|
}
|
|
|
|
// TrimRightSpaceLength returns a length of trailing space characters.
|
|
func TrimRightSpaceLength(source []byte) int {
|
|
l := len(source)
|
|
i := l - 1
|
|
for ; i >= 0; i-- {
|
|
if !IsSpace(source[i]) {
|
|
break
|
|
}
|
|
}
|
|
if i < 0 {
|
|
return l
|
|
}
|
|
return l - 1 - i
|
|
}
|
|
|
|
// TrimLeftSpace returns a subslice of the given string by slicing off all leading
|
|
// space characters.
|
|
func TrimLeftSpace(source []byte) []byte {
|
|
return TrimLeft(source, spaces)
|
|
}
|
|
|
|
// TrimRightSpace returns a subslice of the given string by slicing off all trailing
|
|
// space characters.
|
|
func TrimRightSpace(source []byte) []byte {
|
|
return TrimRight(source, spaces)
|
|
}
|
|
|
|
// ReplaceSpaces replaces sequence of spaces with the given repl.
|
|
func ReplaceSpaces(source []byte, repl byte) []byte {
|
|
var ret []byte
|
|
start := -1
|
|
for i, c := range source {
|
|
iss := IsSpace(c)
|
|
if start < 0 && iss {
|
|
start = i
|
|
continue
|
|
} else if start >= 0 && iss {
|
|
continue
|
|
} else if start >= 0 {
|
|
if ret == nil {
|
|
ret = make([]byte, 0, len(source))
|
|
ret = append(ret, source[:start]...)
|
|
}
|
|
ret = append(ret, repl)
|
|
start = -1
|
|
}
|
|
if ret != nil {
|
|
ret = append(ret, c)
|
|
}
|
|
}
|
|
if start >= 0 && ret != nil {
|
|
ret = append(ret, repl)
|
|
}
|
|
if ret == nil {
|
|
return source
|
|
}
|
|
return ret
|
|
}
|
|
|
|
// ToRune decode given bytes start at pos and returns a rune.
|
|
func ToRune(source []byte, pos int) rune {
|
|
i := pos
|
|
for ; i >= 0; i-- {
|
|
if utf8.RuneStart(source[i]) {
|
|
break
|
|
}
|
|
}
|
|
r, _ := utf8.DecodeRune(source[i:])
|
|
return r
|
|
}
|
|
|
|
// ToValidRune returns 0xFFFD if the given rune is invalid, otherwise v.
|
|
func ToValidRune(v rune) rune {
|
|
if v == 0 || !utf8.ValidRune(v) {
|
|
return rune(0xFFFD)
|
|
}
|
|
return v
|
|
}
|
|
|
|
// ToLinkReference convert given bytes into a valid link reference string.
|
|
// ToLinkReference trims leading and trailing spaces and convert into lower
|
|
// case and replace spaces with a single space character.
|
|
func ToLinkReference(v []byte) string {
|
|
v = TrimLeftSpace(v)
|
|
v = TrimRightSpace(v)
|
|
return strings.ToLower(string(ReplaceSpaces(v, ' ')))
|
|
}
|
|
|
|
var htmlEscapeTable = [256][]byte{nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("""), nil, nil, nil, []byte("&"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("<"), nil, []byte(">"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil}
|
|
|
|
// EscapeHTMLByte returns HTML escaped bytes if the given byte should be escaped,
|
|
// otherwise nil.
|
|
func EscapeHTMLByte(b byte) []byte {
|
|
return htmlEscapeTable[b]
|
|
}
|
|
|
|
// EscapeHTML escapes characters that should be escaped in HTML text.
|
|
func EscapeHTML(v []byte) []byte {
|
|
cob := NewCopyOnWriteBuffer(v)
|
|
n := 0
|
|
for i := 0; i < len(v); i++ {
|
|
c := v[i]
|
|
escaped := htmlEscapeTable[c]
|
|
if escaped != nil {
|
|
cob.Write(v[n:i])
|
|
cob.Write(escaped)
|
|
n = i + 1
|
|
}
|
|
}
|
|
if cob.IsCopied() {
|
|
cob.Write(v[n:])
|
|
}
|
|
return cob.Bytes()
|
|
}
|
|
|
|
// UnescapePunctuations unescapes blackslash escaped punctuations.
|
|
func UnescapePunctuations(source []byte) []byte {
|
|
cob := NewCopyOnWriteBuffer(source)
|
|
limit := len(source)
|
|
n := 0
|
|
for i := 0; i < limit; {
|
|
c := source[i]
|
|
if i < limit-1 && c == '\\' && IsPunct(source[i+1]) {
|
|
cob.Write(source[n:i])
|
|
cob.WriteByte(source[i+1])
|
|
i += 2
|
|
n = i
|
|
continue
|
|
}
|
|
i++
|
|
}
|
|
if cob.IsCopied() {
|
|
cob.Write(source[n:])
|
|
}
|
|
return cob.Bytes()
|
|
}
|
|
|
|
// ResolveNumericReferences resolve numeric references like 'Ӓ" .
|
|
func ResolveNumericReferences(source []byte) []byte {
|
|
cob := NewCopyOnWriteBuffer(source)
|
|
buf := make([]byte, 6, 6)
|
|
limit := len(source)
|
|
ok := false
|
|
n := 0
|
|
for i := 0; i < limit; i++ {
|
|
if source[i] == '&' {
|
|
pos := i
|
|
next := i + 1
|
|
if next < limit && source[next] == '#' {
|
|
nnext := next + 1
|
|
if nnext < limit {
|
|
nc := source[nnext]
|
|
// code point like #x22;
|
|
if nnext < limit && nc == 'x' || nc == 'X' {
|
|
start := nnext + 1
|
|
i, ok = ReadWhile(source, [2]int{start, limit}, IsHexDecimal)
|
|
if ok && i < limit && source[i] == ';' {
|
|
v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 16, 32)
|
|
cob.Write(source[n:pos])
|
|
n = i + 1
|
|
runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
|
|
cob.Write(buf[:runeSize])
|
|
continue
|
|
}
|
|
// code point like #1234;
|
|
} else if nc >= '0' && nc <= '9' {
|
|
start := nnext
|
|
i, ok = ReadWhile(source, [2]int{start, limit}, IsNumeric)
|
|
if ok && i < limit && i-start < 8 && source[i] == ';' {
|
|
v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 0, 32)
|
|
cob.Write(source[n:pos])
|
|
n = i + 1
|
|
runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
|
|
cob.Write(buf[:runeSize])
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
}
|
|
i = next - 1
|
|
}
|
|
}
|
|
if cob.IsCopied() {
|
|
cob.Write(source[n:])
|
|
}
|
|
return cob.Bytes()
|
|
}
|
|
|
|
// ResolveEntityNames resolve entity references like 'ö" .
|
|
func ResolveEntityNames(source []byte) []byte {
|
|
cob := NewCopyOnWriteBuffer(source)
|
|
limit := len(source)
|
|
ok := false
|
|
n := 0
|
|
for i := 0; i < limit; i++ {
|
|
if source[i] == '&' {
|
|
pos := i
|
|
next := i + 1
|
|
if !(next < limit && source[next] == '#') {
|
|
start := next
|
|
i, ok = ReadWhile(source, [2]int{start, limit}, IsAlphaNumeric)
|
|
if ok && i < limit && source[i] == ';' {
|
|
name := BytesToReadOnlyString(source[start:i])
|
|
entity, ok := LookUpHTML5EntityByName(name)
|
|
if ok {
|
|
cob.Write(source[n:pos])
|
|
n = i + 1
|
|
cob.Write(entity.Characters)
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
i = next - 1
|
|
}
|
|
}
|
|
if cob.IsCopied() {
|
|
cob.Write(source[n:])
|
|
}
|
|
return cob.Bytes()
|
|
}
|
|
|
|
var htmlSpace = []byte("%20")
|
|
|
|
// URLEscape escape the given URL.
|
|
// If resolveReference is set true:
|
|
// 1. unescape punctuations
|
|
// 2. resolve numeric references
|
|
// 3. resolve entity references
|
|
//
|
|
// URL encoded values (%xx) are keeped as is.
|
|
func URLEscape(v []byte, resolveReference bool) []byte {
|
|
if resolveReference {
|
|
v = UnescapePunctuations(v)
|
|
v = ResolveNumericReferences(v)
|
|
v = ResolveEntityNames(v)
|
|
}
|
|
cob := NewCopyOnWriteBuffer(v)
|
|
limit := len(v)
|
|
n := 0
|
|
|
|
for i := 0; i < limit; {
|
|
c := v[i]
|
|
if urlEscapeTable[c] == 1 {
|
|
i++
|
|
continue
|
|
}
|
|
if c == '%' && i+2 < limit && IsHexDecimal(v[i+1]) && IsHexDecimal(v[i+1]) {
|
|
i += 3
|
|
continue
|
|
}
|
|
u8len := utf8lenTable[c]
|
|
if u8len == 99 { // invalid utf8 leading byte, skip it
|
|
i++
|
|
continue
|
|
}
|
|
if c == ' ' {
|
|
cob.Write(v[n:i])
|
|
cob.Write(htmlSpace)
|
|
i++
|
|
n = i
|
|
continue
|
|
}
|
|
if int(u8len) >= len(v) {
|
|
u8len = int8(len(v) - 1)
|
|
}
|
|
if u8len == 0 {
|
|
i++
|
|
n = i
|
|
continue
|
|
}
|
|
cob.Write(v[n:i])
|
|
stop := i + int(u8len)
|
|
if stop > len(v) {
|
|
i++
|
|
n = i
|
|
continue
|
|
}
|
|
cob.Write(StringToReadOnlyBytes(url.QueryEscape(string(v[i:stop]))))
|
|
i += int(u8len)
|
|
n = i
|
|
}
|
|
if cob.IsCopied() && n < limit {
|
|
cob.Write(v[n:])
|
|
}
|
|
return cob.Bytes()
|
|
}
|
|
|
|
// FindURLIndex returns a stop index value if the given bytes seem an URL.
|
|
// This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* .
|
|
func FindURLIndex(b []byte) int {
|
|
i := 0
|
|
if !(len(b) > 0 && urlTable[b[i]]&7 == 7) {
|
|
return -1
|
|
}
|
|
i++
|
|
for ; i < len(b); i++ {
|
|
c := b[i]
|
|
if urlTable[c]&4 != 4 {
|
|
break
|
|
}
|
|
}
|
|
if i == 1 || i > 33 || i >= len(b) {
|
|
return -1
|
|
}
|
|
if b[i] != ':' {
|
|
return -1
|
|
}
|
|
i++
|
|
for ; i < len(b); i++ {
|
|
c := b[i]
|
|
if urlTable[c]&1 != 1 {
|
|
break
|
|
}
|
|
}
|
|
return i
|
|
}
|
|
|
|
var emailDomainRegexp = regexp.MustCompile(`^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*`)
|
|
|
|
// FindEmailIndex returns a stop index value if the given bytes seem an email address.
|
|
func FindEmailIndex(b []byte) int {
|
|
// TODO: eliminate regexps
|
|
i := 0
|
|
for ; i < len(b); i++ {
|
|
c := b[i]
|
|
if emailTable[c]&1 != 1 {
|
|
break
|
|
}
|
|
}
|
|
if i == 0 {
|
|
return -1
|
|
}
|
|
if i >= len(b) || b[i] != '@' {
|
|
return -1
|
|
}
|
|
i++
|
|
if i >= len(b) {
|
|
return -1
|
|
}
|
|
match := emailDomainRegexp.FindSubmatchIndex(b[i:])
|
|
if match == nil {
|
|
return -1
|
|
}
|
|
return i + match[1]
|
|
}
|
|
|
|
var spaces = []byte(" \t\n\x0b\x0c\x0d")
|
|
|
|
var spaceTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
|
|
|
|
var punctTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
|
|
|
|
// a-zA-Z0-9, ;/?:@&=+$,-_.!~*'()#
|
|
var urlEscapeTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
|
|
|
|
var utf8lenTable = [256]int8{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 99, 99, 99, 99, 99, 99, 99, 99}
|
|
|
|
var urlTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 1, 0, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
|
|
|
|
var emailTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
|
|
|
|
// UTF8Len returns a byte length of the utf-8 character.
|
|
func UTF8Len(b byte) int8 {
|
|
return utf8lenTable[b]
|
|
}
|
|
|
|
// IsPunct returns true if the given character is a punctuation, otherwise false.
|
|
func IsPunct(c byte) bool {
|
|
return punctTable[c] == 1
|
|
}
|
|
|
|
// IsSpace returns true if the given character is a space, otherwise false.
|
|
func IsSpace(c byte) bool {
|
|
return spaceTable[c] == 1
|
|
}
|
|
|
|
// IsNumeric returns true if the given character is a numeric, otherwise false.
|
|
func IsNumeric(c byte) bool {
|
|
return c >= '0' && c <= '9'
|
|
}
|
|
|
|
// IsHexDecimal returns true if the given character is a hexdecimal, otherwise false.
|
|
func IsHexDecimal(c byte) bool {
|
|
return c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F'
|
|
}
|
|
|
|
// IsAlphaNumeric returns true if the given character is a alphabet or a numeric, otherwise false.
|
|
func IsAlphaNumeric(c byte) bool {
|
|
return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9'
|
|
}
|
|
|
|
// A BufWriter is a subset of the bufio.Writer .
|
|
type BufWriter interface {
|
|
io.Writer
|
|
Available() int
|
|
Buffered() int
|
|
Flush() error
|
|
WriteByte(c byte) error
|
|
WriteRune(r rune) (size int, err error)
|
|
WriteString(s string) (int, error)
|
|
}
|
|
|
|
// A PrioritizedValue struct holds pair of an arbitrary value and a priority.
|
|
type PrioritizedValue struct {
|
|
// Value is an arbitrary value that you want to prioritize.
|
|
Value interface{}
|
|
// Priority is a priority of the value.
|
|
Priority int
|
|
}
|
|
|
|
// PrioritizedSlice is a slice of the PrioritizedValues
|
|
type PrioritizedSlice []PrioritizedValue
|
|
|
|
// Sort sorts the PrioritizedSlice in ascending order.
|
|
func (s PrioritizedSlice) Sort() {
|
|
sort.Slice(s, func(i, j int) bool {
|
|
return s[i].Priority < s[j].Priority
|
|
})
|
|
}
|
|
|
|
// Remove removes the given value from this slice.
|
|
func (s PrioritizedSlice) Remove(v interface{}) PrioritizedSlice {
|
|
i := 0
|
|
found := false
|
|
for ; i < len(s); i++ {
|
|
if s[i].Value == v {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
return s
|
|
}
|
|
return append(s[:i], s[i+1:]...)
|
|
}
|
|
|
|
// Prioritized returns a new PrioritizedValue.
|
|
func Prioritized(v interface{}, priority int) PrioritizedValue {
|
|
return PrioritizedValue{v, priority}
|
|
}
|
|
|
|
func bytesHash(b []byte) uint64 {
|
|
var hash uint64 = 5381
|
|
for _, c := range b {
|
|
hash = ((hash << 5) + hash) + uint64(c)
|
|
}
|
|
return hash
|
|
}
|
|
|
|
// BytesFilter is a efficient data structure for checking whether bytes exist or not.
|
|
// BytesFilter is thread-safe.
|
|
type BytesFilter interface {
|
|
// Add adds given bytes to this set.
|
|
Add([]byte)
|
|
|
|
// Contains return true if this set contains given bytes, otherwise false.
|
|
Contains([]byte) bool
|
|
|
|
// Extend copies this filter and adds given bytes to new filter.
|
|
Extend(...[]byte) BytesFilter
|
|
}
|
|
|
|
type bytesFilter struct {
|
|
chars [256]uint8
|
|
threshold int
|
|
slots [][][]byte
|
|
}
|
|
|
|
// NewBytesFilter returns a new BytesFilter.
|
|
func NewBytesFilter(elements ...[]byte) BytesFilter {
|
|
s := &bytesFilter{
|
|
threshold: 3,
|
|
slots: make([][][]byte, 64),
|
|
}
|
|
for _, element := range elements {
|
|
s.Add(element)
|
|
}
|
|
return s
|
|
}
|
|
|
|
func (s *bytesFilter) Add(b []byte) {
|
|
l := len(b)
|
|
m := s.threshold
|
|
if l < s.threshold {
|
|
m = l
|
|
}
|
|
for i := 0; i < m; i++ {
|
|
s.chars[b[i]] |= 1 << uint8(i)
|
|
}
|
|
h := bytesHash(b) % uint64(len(s.slots))
|
|
slot := s.slots[h]
|
|
if slot == nil {
|
|
slot = [][]byte{}
|
|
}
|
|
s.slots[h] = append(slot, b)
|
|
}
|
|
|
|
func (s *bytesFilter) Extend(bs ...[]byte) BytesFilter {
|
|
newFilter := NewBytesFilter().(*bytesFilter)
|
|
newFilter.chars = s.chars
|
|
newFilter.threshold = s.threshold
|
|
for k, v := range s.slots {
|
|
newSlot := make([][]byte, len(v))
|
|
copy(newSlot, v)
|
|
newFilter.slots[k] = v
|
|
}
|
|
for _, b := range bs {
|
|
newFilter.Add(b)
|
|
}
|
|
return newFilter
|
|
}
|
|
|
|
func (s *bytesFilter) Contains(b []byte) bool {
|
|
l := len(b)
|
|
m := s.threshold
|
|
if l < s.threshold {
|
|
m = l
|
|
}
|
|
for i := 0; i < m; i++ {
|
|
if (s.chars[b[i]] & (1 << uint8(i))) == 0 {
|
|
return false
|
|
}
|
|
}
|
|
h := bytesHash(b) % uint64(len(s.slots))
|
|
slot := s.slots[h]
|
|
if slot == nil || len(slot) == 0 {
|
|
return false
|
|
}
|
|
for _, element := range slot {
|
|
if bytes.Equal(element, b) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|