You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

87 lines
2.2 KiB
Go

/*
* Copyright 2024 CloudWeGo Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package main
import (
"context"
"os"
"github.com/cloudwego/eino-ext/components/document/parser/html"
"github.com/cloudwego/eino-ext/components/document/parser/pdf"
"github.com/cloudwego/eino/components/document/parser"
"github.com/cloudwego/eino-examples/internal/gptr"
"github.com/cloudwego/eino-examples/internal/logs"
)
// main demonstrates parser.ExtParser: it registers an HTML parser and a PDF
// parser under their file extensions, uses a plain-text parser as the
// fallback, then parses ./testdata/test.html and logs the content of every
// resulting document. Any failure is logged and ends the program early.
func main() {
	ctx := context.Background()

	// Plain-text parser, used below as the fallback parser.
	textParser := parser.TextParser{}

	// HTML parser configured with the "body" selector.
	htmlParser, err := html.NewParser(ctx, &html.Config{
		Selector: gptr.Of("body"),
	})
	if err != nil {
		logs.Errorf("html.NewParser failed, err=%v", err)
		return
	}

	// PDF parser created with an empty config.
	pdfParser, err := pdf.NewPDFParser(ctx, &pdf.Config{})
	if err != nil {
		logs.Errorf("pdf.NewPDFParser failed, err=%v", err)
		return
	}

	// Create the extension-based parser.
	extParser, err := parser.NewExtParser(ctx, &parser.ExtParserConfig{
		// Register the parser to use for each specific file extension.
		Parsers: map[string]parser.Parser{
			".html": htmlParser,
			".pdf":  pdfParser,
		},
		// Default parser, used to handle unknown formats.
		FallbackParser: textParser,
	})
	if err != nil {
		// Log the failure like every other error path instead of exiting silently.
		logs.Errorf("parser.NewExtParser failed, err=%v", err)
		return
	}

	// Use the parser.
	filePath := "./testdata/test.html"
	file, err := os.Open(filePath)
	if err != nil {
		logs.Errorf("os.Open failed, file=%v, err=%v", filePath, err)
		return
	}
	// Release the file handle on every return path below.
	defer file.Close()

	docs, err := extParser.Parse(ctx, file,
		// A URI must be provided so that ExtParser can select the correct
		// parser for the input.
		parser.WithURI(filePath),
		parser.WithExtraMeta(map[string]any{
			"source": "local",
		}),
	)
	if err != nil {
		logs.Errorf("extParser.Parse, err=%v", err)
		return
	}

	for idx, doc := range docs {
		logs.Infof("doc_%v content: %v", idx, doc.Content)
	}
}