feat(tazjin/tgsa): add gpt-3 powered message translation feature

this is slow and often overloaded, but it's kind of cool when it
works. this translation method deals much better with the kind of
slang you'd see in telegram posts than any other method.

Change-Id: I7e4c845eb382f0eac627c4237b492c8e40dae574
Reviewed-on: https://cl.tvl.fyi/c/depot/+/8625
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
Vincent Ambo 2023-05-24 19:08:40 +03:00 committed by tazjin
parent 63047449d7
commit 38042ea445
3 changed files with 83 additions and 7 deletions

View file

@ -1075,6 +1075,8 @@ dependencies = [
"ego-tree", "ego-tree",
"rouille", "rouille",
"scraper", "scraper",
"serde",
"serde_json",
"url", "url",
] ]

View file

@ -10,3 +10,5 @@ rouille = { version = "3.5", default-features = false }
url = "2.3" url = "2.3"
scraper = "0.13" scraper = "0.13"
ego-tree = "0.6" # in tandem with 'scraper' ego-tree = "0.6" # in tandem with 'scraper'
serde = "1.0"
serde_json = "1.0"

View file

@ -1,13 +1,15 @@
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use scraper::{Html, Selector};
use serde_json::Value;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::RwLock; use std::sync::RwLock;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use scraper::{Html, Selector};
#[derive(Clone, Debug, Eq, Hash, PartialEq)] #[derive(Clone, Debug, Eq, Hash, PartialEq)]
struct TgLink { struct TgLink {
username: String, username: String,
message_id: usize, message_id: usize,
translated: bool,
} }
impl TgLink { impl TgLink {
@ -16,10 +18,15 @@ impl TgLink {
} }
fn to_url(&self, embed: bool) -> String { fn to_url(&self, embed: bool) -> String {
format!("https://t.me/{}/{}{}", self.username, self.message_id, if embed { "?embed=1" } else { "" }) format!(
"https://t.me/{}/{}{}",
self.username,
self.message_id,
if embed { "?embed=1" } else { "" }
)
} }
fn parse(url: &str) -> Option<Self> { fn parse(url: &str, translated: bool) -> Option<Self> {
let url = url.strip_prefix("/")?; let url = url.strip_prefix("/")?;
let parsed = url::Url::parse(url).ok()?; let parsed = url::Url::parse(url).ok()?;
@ -37,6 +44,7 @@ impl TgLink {
Some(TgLink { Some(TgLink {
username: parts[0].into(), username: parts[0].into(),
message_id: parts[1].parse().ok()?, message_id: parts[1].parse().ok()?,
translated,
}) })
} }
} }
@ -55,6 +63,46 @@ fn fetch_post(link: &TgLink, embed: bool) -> Result<String> {
Ok(response.body) Ok(response.body)
} }
/// Asks the OpenAI chat completions endpoint to translate `message`
/// into English and returns the text of the first completion choice.
///
/// The API key is read from the `OPENAPI_KEY` environment variable on
/// each call and sent as a bearer token.
///
/// # Errors
///
/// Returns an error if the environment variable is unset, if the
/// request can not be built or sent, if `error_for_status` reports the
/// response as failed, if the body is not valid JSON, or if there is no
/// string at `choices[0].message.content` in the decoded body.
fn fetch_translation(message: &str) -> Result<String> {
    let request = serde_json::json!({
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": "Please translate the following message from a Telegram channel into English. If the post is already partially in English, please leave those bits intact as they are. Please respond only with the translation."},
            {"role": "user", "content": message}
        ]
    });

    let api_key = std::env::var("OPENAPI_KEY").context("no openapi key set")?;

    let response: Value = crimp::Request::post("https://api.openai.com/v1/chat/completions")
        .bearer_auth(&api_key)?
        .json(&request)?
        .send()
        .context("failed to fetch translation from openai")?
        // Check the status on the raw response *before* decoding it: a
        // failed request is not guaranteed to carry a JSON body, and
        // decoding first would replace the status and body in the error
        // with an unrelated JSON parse failure.
        .error_for_status(|resp| {
            anyhow!(
                "translation request failed: {} ({})",
                String::from_utf8_lossy(&resp.body),
                resp.status
            )
        })?
        .as_json::<Value>()
        .context("failed to decode translation response")?
        .body;

    // The translation is expected at choices[0].message.content; every
    // step is checked separately so that the error names the exact part
    // of the response that was missing.
    let translation = response
        .get("choices")
        .ok_or_else(|| anyhow!("missing 'choices' key"))?
        .get(0)
        .ok_or_else(|| anyhow!("empty 'choices' or something"))?
        .get("message")
        .ok_or_else(|| anyhow!("missing 'message' key"))?
        .get("content")
        .ok_or_else(|| anyhow!("missing 'content' key"))?
        .as_str()
        .ok_or_else(|| anyhow!("'content' was not a string"))?;

    Ok(translation.to_string())
}
// in some cases, posts can not be embedded, but telegram still // in some cases, posts can not be embedded, but telegram still
// includes their content in metadata tags for content previews. // includes their content in metadata tags for content previews.
// //
@ -255,6 +303,12 @@ fn fetch_with_cache(cache: &Cache, link: &TgLink) -> Result<TgPost> {
msg.message = fetch_fallback(&link)?; msg.message = fetch_fallback(&link)?;
} }
if let Some(message) = &msg.message {
if link.translated {
msg.message = Some(fetch_translation(message)?);
}
}
let bbcode = to_bbcode(&link, &msg); let bbcode = to_bbcode(&link, &msg);
let mut media = vec![]; let mut media = vec![];
@ -292,6 +346,7 @@ fn handle_img_redirect(cache: &Cache, img_path: &str) -> Result<rouille::Respons
let link = TgLink { let link = TgLink {
username: img_parts[0].into(), username: img_parts[0].into(),
message_id: img_parts[1].parse().context("failed to parse message_id")?, message_id: img_parts[1].parse().context("failed to parse message_id")?,
translated: false,
}; };
let img_idx: usize = img_parts[2].parse().context("failed to parse img_idx")?; let img_idx: usize = img_parts[2].parse().context("failed to parse img_idx")?;
@ -320,12 +375,20 @@ fn main() {
let cache: Cache = RwLock::new(HashMap::new()); let cache: Cache = RwLock::new(HashMap::new());
rouille::start_server("0.0.0.0:8472", move |request| { rouille::start_server("0.0.0.0:8472", move |request| {
let mut raw_url = request.raw_url();
let mut translate = false;
let response = loop { let response = loop {
if request.raw_url().starts_with("/img/") { if raw_url.starts_with("/img/") {
break handle_img_redirect(&cache, &request.raw_url()[5..]); break handle_img_redirect(&cache, &raw_url[5..]);
} }
break match TgLink::parse(request.raw_url()) { if raw_url.starts_with("/translate/") {
translate = true;
raw_url = &raw_url[10..];
}
break match TgLink::parse(raw_url, translate) {
None => Ok(rouille::Response::text( None => Ok(rouille::Response::text(
r#"tgsa r#"tgsa
---- ----
@ -345,7 +408,16 @@ yes, that looks stupid, but it works
if you see this message and think you did the above correctly, you if you see this message and think you did the above correctly, you
didn't. try again. idiot. didn't. try again. idiot.
pm me on the forums if this makes you mad or something. it can also translate posts from russian, ukrainian or whatever other
dumb language you speak into english, by adding `/translate/`, for
example:
https://tgsa.tazj.in/translate/https://t.me/strelkovii/4329
expect this to be slow though. that's the price to pay for translating
shitty slang.
pm me on the forums if any of this makes you mad or something.
"#, "#,
)), )),
Some(link) => handle_tg_link(&cache, &link), Some(link) => handle_tg_link(&cache, &link),