refactor(data):重构财务数据爬取功能

- 移除雪球爬虫测试,改为 sina 和 eastmoney 测试
- 新增eastmoney财务数据爬取支持
- 优化openai_api.go中的财务报告获取逻辑
- 使用通用爬虫API替代chromedp实现
This commit is contained in:
ArvinLovegood 2025-03-31 14:05:04 +08:00
parent 5f8556cc3d
commit f1e40e7d3b
2 changed files with 56 additions and 79 deletions

View File

@ -298,7 +298,7 @@ func TestUSSINA(t *testing.T) {
})
}
func TestXueqiu(t *testing.T) {
func TestSina(t *testing.T) {
db.Init("../../data/stock.db")
url := "https://finance.sina.com.cn/realstock/company/sz002906/nc.shtml"
crawlerAPI := CrawlerApi{}
@ -331,6 +331,34 @@ func TestXueqiu(t *testing.T) {
}
// TestDC is a live integration test: it fetches the eastmoney F10
// financial-analysis page for sh600745 through CrawlerApi and feeds the
// resulting HTML to GetTableMarkdown.
//
// It needs network access. When the page cannot be fetched the test is
// skipped rather than silently reported as passing.
func TestDC(t *testing.T) {
	url := "https://emweb.securities.eastmoney.com/pc_hsf10/pages/index.html?type=web&code=sh600745#/cwfx"
	db.Init("../../data/stock.db")

	crawlerAPI := CrawlerApi{}
	crawlerBaseInfo := CrawlerBaseInfo{
		Name:        "TestCrawler",
		Description: "Test Crawler Description",
		BaseUrl:     "https://emweb.securities.eastmoney.com",
		Headers:     map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
	}

	// NOTE(review): 60 minutes is longer than `go test`'s default 10m
	// timeout, so the test binary's own timeout fires first unless -timeout
	// is raised — confirm whether a shorter bound was intended.
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
	defer cancel()
	crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)

	var markdown strings.Builder
	markdown.WriteString("\n ## 财务数据:\n")

	html, ok := crawlerAPI.GetHtml(url, "div.report_table table", false)
	if !ok {
		t.Skipf("could not fetch %s; skipping live crawler test", url)
	}

	document, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		// Stop here: the document is not usable after a parse failure, so
		// it must not be passed on to GetTableMarkdown.
		t.Fatalf("parsing fetched HTML: %v", err)
	}

	GetTableMarkdown(document, "div.report_table table", &markdown)
	// Surface the generated output so `go test -v` shows what was extracted.
	t.Log(markdown.String())
}
type Tick struct {
Code int `json:"code"`
Status string `json:"status"`

View File

@ -499,99 +499,48 @@ func SearchGuShiTongStockInfo(stock string, crawlTimeOut int64) *[]string {
}
func GetFinancialReports(stockCode string, crawlTimeOut int64) *[]string {
url := "https://emweb.securities.eastmoney.com/pc_hsf10/pages/index.html?type=web&code=" + stockCode + "#/cwfx"
waitVisible := "div.report_table table"
if strutil.HasPrefixAny(stockCode, []string{"HK", "hk"}) {
stockCode = strings.ReplaceAll(stockCode, "hk", "")
stockCode = strings.ReplaceAll(stockCode, "HK", "")
url = "https://emweb.securities.eastmoney.com/PC_HKF10/pages/home/index.html?code=" + stockCode + "&type=web&color=w#/NewFinancialAnalysis"
waitVisible = "div table.commonTable"
}
if strutil.HasPrefixAny(stockCode, []string{"us", "gb_"}) {
stockCode = strings.ReplaceAll(stockCode, "us", "")
stockCode = strings.ReplaceAll(stockCode, "gb_", "")
url = "https://emweb.securities.eastmoney.com/pc_usf10/pages/index.html?type=web&code=" + stockCode + "#/cwfx"
waitVisible = "div.zyzb_table_detail table"
}
// 创建一个 chromedp 上下文
timeoutCtx, timeoutCtxCancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
defer timeoutCtxCancel()
var ctx context.Context
var cancel context.CancelFunc
path := getConfig().BrowserPath
logger.SugaredLogger.Infof("GetFinancialReports path:%s", path)
logger.SugaredLogger.Infof("GetFinancialReports搜索股票-%s: %s", stockCode, url)
if path != "" {
pctx, pcancel := chromedp.NewExecAllocator(
timeoutCtx,
chromedp.ExecPath(path),
chromedp.Flag("headless", true),
chromedp.Flag("disable-javascript", false),
chromedp.Flag("disable-gpu", true),
chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"),
chromedp.Flag("disable-background-networking", true),
chromedp.Flag("enable-features", "NetworkService,NetworkServiceInProcess"),
chromedp.Flag("disable-background-timer-throttling", true),
chromedp.Flag("disable-backgrounding-occluded-windows", true),
chromedp.Flag("disable-breakpad", true),
chromedp.Flag("disable-client-side-phishing-detection", true),
chromedp.Flag("disable-default-apps", true),
chromedp.Flag("disable-dev-shm-usage", true),
chromedp.Flag("disable-extensions", true),
chromedp.Flag("disable-features", "site-per-process,Translate,BlinkGenPropertyTrees"),
chromedp.Flag("disable-hang-monitor", true),
chromedp.Flag("disable-ipc-flooding-protection", true),
chromedp.Flag("disable-popup-blocking", true),
chromedp.Flag("disable-prompt-on-repost", true),
chromedp.Flag("disable-renderer-backgrounding", true),
chromedp.Flag("disable-sync", true),
chromedp.Flag("force-color-profile", "srgb"),
chromedp.Flag("metrics-recording-only", true),
chromedp.Flag("safebrowsing-disable-auto-update", true),
chromedp.Flag("enable-automation", true),
chromedp.Flag("password-store", "basic"),
chromedp.Flag("use-mock-keychain", true),
)
defer pcancel()
ctx, cancel = chromedp.NewContext(
pctx,
chromedp.WithLogf(logger.SugaredLogger.Infof),
chromedp.WithErrorf(logger.SugaredLogger.Errorf),
)
} else {
ctx, cancel = chromedp.NewContext(
timeoutCtx,
chromedp.WithLogf(logger.SugaredLogger.Infof),
chromedp.WithErrorf(logger.SugaredLogger.Errorf),
)
db.Init("../../data/stock.db")
crawlerAPI := CrawlerApi{}
crawlerBaseInfo := CrawlerBaseInfo{
Name: "TestCrawler",
Description: "Test Crawler Description",
BaseUrl: "https://emweb.securities.eastmoney.com",
Headers: map[string]string{"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0"},
}
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(crawlTimeOut)*time.Second)
defer cancel()
var htmlContent string
url := fmt.Sprintf("https://xueqiu.com/snowman/S/%s/detail#/ZYCWZB", stockCode)
err := chromedp.Run(ctx,
chromedp.Navigate(url),
// 等待页面加载完成,可以根据需要调整等待时间
chromedp.WaitVisible("table.table", chromedp.ByQuery),
chromedp.OuterHTML("html", &htmlContent, chromedp.ByQuery),
)
crawlerAPI = crawlerAPI.NewCrawler(ctx, crawlerBaseInfo)
var markdown strings.Builder
markdown.WriteString("\n## 财务数据:\n")
html, ok := crawlerAPI.GetHtml(url, waitVisible, true)
if !ok {
return &[]string{""}
}
document, err := goquery.NewDocumentFromReader(strings.NewReader(html))
if err != nil {
logger.SugaredLogger.Error(err.Error())
}
document, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
if err != nil {
logger.SugaredLogger.Error(err.Error())
return &[]string{}
}
var messages []string
document.Find("table tr").Each(func(i int, selection *goquery.Selection) {
tr := ""
selection.Find("th,td").Each(func(i int, selection *goquery.Selection) {
ret := selection.Find("p").First().Text()
if ret == "" {
ret = selection.Text()
}
text := strutil.RemoveNonPrintable(ret)
tr += text + " "
})
logger.SugaredLogger.Infof("%s", tr+" \n")
messages = append(messages, tr+" \n")
})
return &messages
GetTableMarkdown(document, waitVisible, &markdown)
return &[]string{markdown.String()}
}
func GetTelegraphList(crawlTimeOut int64) *[]string {