mirror of
https://github.com/openclaw/openclaw.git
synced 2026-02-09 05:19:32 +08:00
161 lines
3.5 KiB
Go
161 lines
3.5 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"io"
|
|
"strings"
|
|
|
|
"github.com/yuin/goldmark"
|
|
"github.com/yuin/goldmark/ast"
|
|
"github.com/yuin/goldmark/extension"
|
|
"github.com/yuin/goldmark/text"
|
|
"golang.org/x/net/html"
|
|
"sort"
|
|
)
|
|
|
|
// htmlReplacement describes one splice into a document: the bytes in
// [Start, Stop) of the original body are to be replaced by Value.
type htmlReplacement struct {
	// Start and Stop are byte offsets into the original body; Stop is
	// exclusive.
	Start, Stop int
	// Value is the text written in place of the original span.
	Value string
}
|
|
|
|
func translateHTMLBlocks(ctx context.Context, translator *PiTranslator, body, srcLang, tgtLang string) (string, error) {
|
|
source := []byte(body)
|
|
r := text.NewReader(source)
|
|
md := goldmark.New(
|
|
goldmark.WithExtensions(extension.GFM),
|
|
)
|
|
doc := md.Parser().Parse(r)
|
|
|
|
replacements := make([]htmlReplacement, 0, 8)
|
|
|
|
_ = ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
|
|
if !entering {
|
|
return ast.WalkContinue, nil
|
|
}
|
|
block, ok := n.(*ast.HTMLBlock)
|
|
if !ok {
|
|
return ast.WalkContinue, nil
|
|
}
|
|
start, stop, ok := htmlBlockSpan(block, source)
|
|
if !ok {
|
|
return ast.WalkSkipChildren, nil
|
|
}
|
|
htmlText := string(source[start:stop])
|
|
translated, err := translateHTMLBlock(ctx, translator, htmlText, srcLang, tgtLang)
|
|
if err != nil {
|
|
return ast.WalkStop, err
|
|
}
|
|
replacements = append(replacements, htmlReplacement{Start: start, Stop: stop, Value: translated})
|
|
return ast.WalkSkipChildren, nil
|
|
})
|
|
|
|
if len(replacements) == 0 {
|
|
return body, nil
|
|
}
|
|
|
|
return applyHTMLReplacements(body, replacements), nil
|
|
}
|
|
|
|
func htmlBlockSpan(block *ast.HTMLBlock, source []byte) (int, int, bool) {
|
|
lines := block.Lines()
|
|
if lines.Len() == 0 {
|
|
return 0, 0, false
|
|
}
|
|
start := lines.At(0).Start
|
|
stop := lines.At(lines.Len() - 1).Stop
|
|
if start >= stop {
|
|
return 0, 0, false
|
|
}
|
|
return start, stop, true
|
|
}
|
|
|
|
func applyHTMLReplacements(body string, replacements []htmlReplacement) string {
|
|
if len(replacements) == 0 {
|
|
return body
|
|
}
|
|
sortHTMLReplacements(replacements)
|
|
var out strings.Builder
|
|
last := 0
|
|
for _, rep := range replacements {
|
|
if rep.Start < last {
|
|
continue
|
|
}
|
|
out.WriteString(body[last:rep.Start])
|
|
out.WriteString(rep.Value)
|
|
last = rep.Stop
|
|
}
|
|
out.WriteString(body[last:])
|
|
return out.String()
|
|
}
|
|
|
|
func sortHTMLReplacements(replacements []htmlReplacement) {
|
|
sort.Slice(replacements, func(i, j int) bool {
|
|
return replacements[i].Start < replacements[j].Start
|
|
})
|
|
}
|
|
|
|
func translateHTMLBlock(ctx context.Context, translator *PiTranslator, htmlText, srcLang, tgtLang string) (string, error) {
|
|
tokenizer := html.NewTokenizer(strings.NewReader(htmlText))
|
|
var out strings.Builder
|
|
skipDepth := 0
|
|
|
|
for {
|
|
tt := tokenizer.Next()
|
|
if tt == html.ErrorToken {
|
|
if err := tokenizer.Err(); err != nil && err != io.EOF {
|
|
return "", err
|
|
}
|
|
break
|
|
}
|
|
|
|
raw := string(tokenizer.Raw())
|
|
tok := tokenizer.Token()
|
|
|
|
switch tt {
|
|
case html.StartTagToken:
|
|
out.WriteString(raw)
|
|
if isSkipTag(strings.ToLower(tok.Data)) {
|
|
skipDepth++
|
|
}
|
|
case html.EndTagToken:
|
|
out.WriteString(raw)
|
|
if isSkipTag(strings.ToLower(tok.Data)) && skipDepth > 0 {
|
|
skipDepth--
|
|
}
|
|
case html.SelfClosingTagToken:
|
|
out.WriteString(raw)
|
|
case html.TextToken:
|
|
if shouldTranslateHTMLText(skipDepth, raw) {
|
|
translated, err := translator.Translate(ctx, raw, srcLang, tgtLang)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
out.WriteString(translated)
|
|
} else {
|
|
out.WriteString(raw)
|
|
}
|
|
default:
|
|
out.WriteString(raw)
|
|
}
|
|
}
|
|
|
|
return out.String(), nil
|
|
}
|
|
|
|
// shouldTranslateHTMLText reports whether a text token should be sent for
// translation: skipDepth must be zero and text must contain something other
// than whitespace.
func shouldTranslateHTMLText(skipDepth int, text string) bool {
	hasContent := strings.TrimSpace(text) != ""
	return hasContent && skipDepth == 0
}
|
|
|
|
// isSkipTag reports whether tag is exactly one of "code", "pre", "script" or
// "style". The comparison is case-sensitive.
func isSkipTag(tag string) bool {
	return tag == "code" ||
		tag == "pre" ||
		tag == "script" ||
		tag == "style"
}
|