From 1067b6016fd8feb4ccd1f17716e634e6db777266 Mon Sep 17 00:00:00 2001 From: Yakumo Hokori Date: Wed, 16 Jul 2025 14:54:21 +0800 Subject: [PATCH] =?UTF-8?q?feat(tms=5Fservice):=20=E6=B7=BB=E5=8A=A0html?= =?UTF-8?q?=5Fscraper=E6=A8=A1=E5=9D=97=E6=9B=BF=E4=BB=A3openai=E8=A7=A3?= =?UTF-8?q?=E6=9E=90=E9=93=BE=E6=8E=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 使用html_scraper模块通过CSS选择器直接解析HTML获取链接和token,替代之前依赖openai的方式 --- src-tauri/Cargo.lock | 10 +++++ src-tauri/html_scraper/Cargo.toml | 9 ++++ src-tauri/html_scraper/src/lib.rs | 73 +++++++++++++++++++++++++++++++ src-tauri/tms_service/Cargo.toml | 1 + src-tauri/tms_service/src/lib.rs | 21 ++++++--- 5 files changed, 108 insertions(+), 6 deletions(-) create mode 100644 src-tauri/html_scraper/Cargo.toml create mode 100644 src-tauri/html_scraper/src/lib.rs diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index 3aa22ab..fd02363 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -1593,6 +1593,15 @@ dependencies = [ "match_token", ] +[[package]] +name = "html_scraper" +version = "0.1.0" +dependencies = [ + "regex", + "scraper", + "thiserror 1.0.69", +] + [[package]] name = "http" version = "1.3.1" @@ -4458,6 +4467,7 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" name = "tms_service" version = "0.1.0" dependencies = [ + "html_scraper", "openai", "reqwest", "serde", diff --git a/src-tauri/html_scraper/Cargo.toml b/src-tauri/html_scraper/Cargo.toml new file mode 100644 index 0000000..ba90818 --- /dev/null +++ b/src-tauri/html_scraper/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "html_scraper" +version = "0.1.0" +edition = "2024" + +[dependencies] +scraper = "0.19.1" +thiserror = "1.0" +regex = "1.10.5" \ No newline at end of file diff --git a/src-tauri/html_scraper/src/lib.rs b/src-tauri/html_scraper/src/lib.rs new file mode 100644 index 0000000..36630fb --- /dev/null +++ b/src-tauri/html_scraper/src/lib.rs @@ -0,0 +1,73 @@ 
+use scraper::{Html, Selector};
+use thiserror::Error;
+use regex::Regex;
+
+/// Errors this crate can return.
+#[derive(Debug, Error)]
+pub enum ScrapeError {
+    /// An internally used CSS selector failed to parse.
+    #[error("无效的CSS选择器: {0}")]
+    InvalidSelector(String),
+    /// A regular expression failed to compile.
+    #[error("无效的正则表达式: {0}")]
+    InvalidRegex(#[from] regex::Error),
+}
+
+/// Parses `selector_str`, mapping a parse failure to [`ScrapeError::InvalidSelector`].
+fn parse_selector(selector_str: &str) -> Result<Selector, ScrapeError> {
+    Selector::parse(selector_str)
+        .map_err(|e| ScrapeError::InvalidSelector(format!("'{selector_str}': {e}")))
+}
+
+/// Returns the `href` of the first element matching `selector_str` that has a text
+/// node equal to `link_text` after trimming; matches without an `href` are skipped.
+fn find_link_by_text(html_body: &str, selector_str: &str, link_text: &str) -> Result<Option<String>, ScrapeError> {
+    let selector = parse_selector(selector_str)?;
+    let document = Html::parse_document(html_body);
+    let link = document
+        .select(&selector)
+        .filter(|element| element.text().any(|text| text.trim() == link_text))
+        .find_map(|element| element.value().attr("href"))
+        .map(str::to_string);
+    Ok(link)
+}
+
+/// Returns the link of the first "编辑" (edit) button, i.e. `a.btn.btn-info.btn-xs.m-bot5`.
+pub fn find_edit_link(html_body: &str) -> Result<Option<String>, ScrapeError> {
+    find_link_by_text(html_body, "a.btn.btn-info.btn-xs.m-bot5", "编辑")
+}
+
+/// Returns the link of the first "TMS配置" (TMS config) button, i.e. `a.btn.btn-default.not-cinema`.
+pub fn find_tms_config_link(html_body: &str) -> Result<Option<String>, ScrapeError> {
+    find_link_by_text(html_body, "a.btn.btn-default.not-cinema", "TMS配置")
+}
+
+/// Returns the `value` attribute of the first `input[name="token"]` element in `html_body`.
+///
+/// Yields `Ok(None)` when there is no such element or that first element has no `value`.
+pub fn find_token(html_body: &str) -> Result<Option<String>, ScrapeError> {
+    let token_selector = parse_selector(r#"input[name="token"]"#)?;
+    let document = Html::parse_document(html_body);
+    // Only the first match is inspected, so a later input cannot supply the token.
+    let token = document
+        .select(&token_selector)
+        .next()
+        .and_then(|element| element.value().attr("value"))
+        .map(str::to_string);
+    Ok(token)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn finds_links_and_token() {
+        let page = r#"<a class="btn btn-info btn-xs m-bot5" href="/edit/1"> 编辑 </a>
+            <a class="btn btn-default not-cinema" href="/tms/1">TMS配置</a>
+            <input name="token" value="abc123">"#;
+        assert_eq!(find_edit_link(page).unwrap().as_deref(), Some("/edit/1"));
+        assert_eq!(find_tms_config_link(page).unwrap().as_deref(), Some("/tms/1"));
+        assert_eq!(find_token(page).unwrap().as_deref(), Some("abc123"));
+    }
+}
diff --git a/src-tauri/tms_service/Cargo.toml
b/src-tauri/tms_service/Cargo.toml index a9e8eaa..bcca07c 100644 --- a/src-tauri/tms_service/Cargo.toml +++ b/src-tauri/tms_service/Cargo.toml @@ -9,3 +9,4 @@ tokio = { version = "1", features = ["full"] } serde = { version = "1", features = ["derive"] } serde_json = "1" openai = { path = "../openai" } +html_scraper = { path = "../html_scraper" } diff --git a/src-tauri/tms_service/src/lib.rs b/src-tauri/tms_service/src/lib.rs index d0d46d2..de68f19 100644 --- a/src-tauri/tms_service/src/lib.rs +++ b/src-tauri/tms_service/src/lib.rs @@ -40,8 +40,11 @@ pub async fn create_ticket(cookie: &str, n2p: &str, massageQ: &str, wx: Option<& println!("源码获取完成"); - let system_prompt = "告诉我编辑按钮的链接,只要告诉我链接的url就可以,不要说其他的任何内容"; - let edit_page_url = openai::ask_openai(&body, system_prompt).await?; + let edit_page_url = match html_scraper::find_edit_link(&body) { + Ok(Some(link)) => link, + Ok(None) => return Ok("没有找到编辑链接".to_string()), + Err(e) => return Ok(format!("查找编辑链接时出错: {}", e)), + }; println!("平台id{}", edit_page_url); // 3. Visit the edit page to get its source @@ -50,8 +53,11 @@ pub async fn create_ticket(cookie: &str, n2p: &str, massageQ: &str, wx: Option<& let body2 = String::from_utf8_lossy(&body2_bytes).to_string(); // 4. Get the TMS config URL - let system_prompt2 = "告诉我TMS配置按钮的链接,只要告诉我链接的url就可以,不要说其他的任何内容"; - let tms_config_url = openai::ask_openai(&body2, system_prompt2).await?; + let tms_config_url = match html_scraper::find_tms_config_link(&body2) { + Ok(Some(link)) => link, + Ok(None) => return Ok("没有找到TMS配置链接".to_string()), + Err(e) => return Ok(format!("查找TMS配置链接时出错: {}", e)), + }; println!("tms配置链接{}", tms_config_url); // 5. 
Visit the TMS config URL to get the token @@ -59,8 +65,11 @@ pub async fn create_ticket(cookie: &str, n2p: &str, massageQ: &str, wx: Option<& let body3_bytes = res3.bytes().await?; let body3 = String::from_utf8_lossy(&body3_bytes).to_string(); - let system_prompt3 = "告诉我权限认证(token)的值,只要告诉我对应的值就行,不要说其他的任何内容"; - let token = openai::ask_openai(&body3, system_prompt3).await?; + let token = match html_scraper::find_token(&body3) { + Ok(Some(t)) => t, + Ok(None) => return Ok("没有找到token".to_string()), + Err(e) => return Ok(format!("查找token时出错: {}", e)), + }; println!("token={}", token); // 6. Visit the final URL to get the session cookies