feat(tazjin/tgsa): support extracting fallback message from preview
Some Telegram channels do not allow embedding of messages, but do allow a preview to be shown on Twitter. This preview is embedded in the HTML of the post page and can be scraped out if no message was found in the embed. Technically this preview also contains image links, but they point to very low-resolution, thumbnail-style images, so I decided not to include them here.
Change-Id: Ifb89f9fbde8140d577a5ee3af6e60b04232e53e3
Reviewed-on: https://cl.tvl.fyi/c/depot/+/8480
Autosubmit: tazjin <tazjin@tvl.su>
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
parent
cd80c00a6b
commit
eadcfbbfab
1 changed files with 37 additions and 11 deletions
|
@ -2,6 +2,7 @@ use anyhow::{anyhow, Context, Result};
|
|||
use std::collections::HashMap;
|
||||
use std::sync::RwLock;
|
||||
use std::time::{Duration, Instant};
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
|
||||
struct TgLink {
|
||||
|
@ -14,8 +15,8 @@ impl TgLink {
|
|||
format!("t.me/{}/{}", self.username, self.message_id)
|
||||
}
|
||||
|
||||
fn to_url(&self) -> String {
|
||||
format!("https://t.me/{}/{}?embed=1", self.username, self.message_id)
|
||||
fn to_url(&self, embed: bool) -> String {
|
||||
format!("https://t.me/{}/{}{}", self.username, self.message_id, if embed { "?embed=1" } else { "" })
|
||||
}
|
||||
|
||||
fn parse(url: &str) -> Option<Self> {
|
||||
|
@ -40,9 +41,9 @@ impl TgLink {
|
|||
}
|
||||
}
|
||||
|
||||
fn fetch_embed(link: &TgLink) -> Result<String> {
|
||||
fn fetch_post(link: &TgLink, embed: bool) -> Result<String> {
|
||||
println!("fetching {}#{}", link.username, link.message_id);
|
||||
let response = crimp::Request::get(&link.to_url())
|
||||
let response = crimp::Request::get(&link.to_url(embed))
|
||||
.send()
|
||||
.context("failed to fetch embed data")?
|
||||
.as_string()
|
||||
|
@ -54,6 +55,28 @@ fn fetch_embed(link: &TgLink) -> Result<String> {
|
|||
Ok(response.body)
|
||||
}
|
||||
|
||||
// in some cases, posts can not be embedded, but telegram still
|
||||
// includes their content in metadata tags for content previews.
|
||||
//
|
||||
// we skip images in this case, as they are scaled down to thumbnail
|
||||
// size and not useful.
|
||||
fn fetch_fallback(link: &TgLink) -> Result<Option<String>> {
|
||||
let post = fetch_post(link, false)?;
|
||||
let doc = Html::parse_document(&post);
|
||||
let desc_sel = Selector::parse("meta[property=\"og:description\"]").unwrap();
|
||||
let desc_elem = match doc.select(&desc_sel).next() {
|
||||
None => return Ok(None),
|
||||
Some(elem) => elem,
|
||||
};
|
||||
|
||||
let content = match desc_elem.value().attr("content") {
|
||||
None => return Ok(None),
|
||||
Some(content) => content.to_string(),
|
||||
};
|
||||
|
||||
return Ok(Some(content));
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TgMessage {
|
||||
author: String,
|
||||
|
@ -71,8 +94,6 @@ fn extract_photo_url(style: &str) -> Option<&str> {
|
|||
}
|
||||
|
||||
fn parse_tgmessage(embed: &str) -> Result<TgMessage> {
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
let doc = Html::parse_document(embed);
|
||||
|
||||
let author_sel = Selector::parse("a.tgme_widget_message_owner_name").unwrap();
|
||||
|
@ -164,7 +185,7 @@ fn to_bbcode(link: &TgLink, msg: &TgMessage) -> String {
|
|||
out.push_str(&format!("[quote=\"{}\"]\n", msg.author));
|
||||
|
||||
for video in 0..msg.videos.len() {
|
||||
out.push_str(&format!("[url=\"{}\"]", link.to_url()));
|
||||
out.push_str(&format!("[url=\"{}\"]", link.to_url(true)));
|
||||
|
||||
// video thumbnail links are appended to the photos, hence the
|
||||
// addition here
|
||||
|
@ -184,7 +205,7 @@ fn to_bbcode(link: &TgLink, msg: &TgMessage) -> String {
|
|||
if msg.has_audio {
|
||||
out.push_str(&format!(
|
||||
"[i]This message has audio attached. Go [url=\"{}\"]to Telegram[/url] to listen.[/i]",
|
||||
link.to_url(),
|
||||
link.to_url(true),
|
||||
));
|
||||
}
|
||||
|
||||
|
@ -196,7 +217,7 @@ fn to_bbcode(link: &TgLink, msg: &TgMessage) -> String {
|
|||
|
||||
out.push_str(&format!(
|
||||
"[sub](from [url=\"{}\"]{}[/url], via [url=\"https://tgsa.tazj.in\"]tgsa[/url])[/sub]\n",
|
||||
link.to_url(),
|
||||
link.to_url(true),
|
||||
link.human_friendly_url(),
|
||||
));
|
||||
|
||||
|
@ -227,8 +248,13 @@ fn fetch_with_cache(cache: &Cache, link: &TgLink) -> Result<TgPost> {
|
|||
// TODO(tazjin): per link?
|
||||
let mut writer = cache.write().unwrap();
|
||||
|
||||
let embed = fetch_embed(&link)?;
|
||||
let mut msg = parse_tgmessage(&embed)?;
|
||||
let post = fetch_post(&link, true)?;
|
||||
let mut msg = parse_tgmessage(&post)?;
|
||||
|
||||
if msg.message.is_none() {
|
||||
msg.message = fetch_fallback(&link)?;
|
||||
}
|
||||
|
||||
let bbcode = to_bbcode(&link, &msg);
|
||||
|
||||
let mut media = vec![];
|
||||
|
|
Loading…
Reference in a new issue