Scraper.java

package jasper.component;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.mdimension.jchronic.Chronic;
import com.mdimension.jchronic.Options;
import com.mdimension.jchronic.tags.Pointer;
import io.micrometer.core.annotation.Timed;
import jakarta.persistence.EntityManager;
import jasper.component.dto.JsonLd;
import jasper.domain.Ref;
import jasper.plugin.Scrape;
import jasper.plugin.Video;
import jasper.repository.RefRepository;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Profile;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.net.URISyntaxException;
import java.time.DateTimeException;
import java.time.Instant;
import java.time.LocalDate;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeParseException;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import static jasper.domain.Ref.from;
import static jasper.domain.proj.HasTags.hasMedia;
import static jasper.util.Logging.getMessage;
import static org.apache.commons.lang3.StringUtils.isBlank;
import static org.apache.commons.lang3.StringUtils.isNotBlank;

@Profile("proxy | file-cache")
@Component
public class Scraper {
	private static final Logger logger = LoggerFactory.getLogger(Scraper.class);

	// Source of the "+plugin/scrape" configurations looked up in getConfig().
	@Autowired
	ConfigCache configs;

	// Used by cacheLater() to tag a URL with "_plugin/delta/cache".
	@Autowired
	Tagger tagger;

	// Fetches the raw content of the URL being scraped (see rss() and web()).
	@Autowired
	Proxy proxy;

	// Cleans extracted HTML/SVG fragments before they are stored on the Ref.
	@Autowired
	Sanitizer sanitizer;

	// Deserializes LD+JSON blocks and converts nested JSON nodes to JsonLd.
	@Autowired
	ObjectMapper objectMapper;

	// Looks up existing Refs by URL and origin.
	@Autowired
	RefRepository refRepository;

	// Used by web() to detach the loaded Ref so scraped changes are not persisted.
	@Autowired
	EntityManager em;

	/**
	 * Selects the scrape configuration that applies to a URL.
	 *
	 * <p>Every "+plugin/scrape" config for the origin is checked in order; the first one
	 * with a scheme that matches the whole URL wins. A scheme is matched literally except
	 * for {@code *}, which matches any run of characters.
	 *
	 * @param url    the URL about to be scraped
	 * @param origin the origin whose configs are consulted
	 * @return the first matching config, otherwise whatever is stored under
	 *         "config:scrape-catchall" for the origin
	 */
	@Timed(value = "jasper.scrape", histogram = true)
	public Scrape getConfig(String url, String origin) {
		for (var candidate : configs.getAllConfigs(origin, "+plugin/scrape", Scrape.class)) {
			var schemes = candidate.getSchemes();
			if (schemes == null) continue;
			for (var scheme : schemes) {
				// Quote the scheme as a literal, then close/re-open the quoting around each '*'
				// so that it becomes the regex wildcard ".*".
				var pattern = Pattern.quote(scheme).replace("*", "\\E.*\\Q");
				if (Pattern.matches(pattern, url)) return candidate;
			}
		}
		return configs.getConfig("config:scrape-catchall", origin, "+plugin/scrape", Scrape.class);
	}

	/**
	 * Discovers the feed advertised by a web page.
	 *
	 * @param url the page to inspect
	 * @return the absolute URL of the first {@code <link>} whose type is
	 *         {@code application/rss+xml} or {@code application/atom+xml}, or {@code null}
	 *         when the response is missing, is not markup, or advertises no feed
	 * @throws IOException declared for failures while fetching the page
	 */
	@Timed(value = "jasper.scrape")
	public String rss(String url) throws IOException {
		var data = proxy.fetchString(url);
		// A missing response cannot advertise a feed; web() guards against missing data too.
		if (data == null || !data.trim().startsWith("<")) return null;
		var doc = Jsoup.parse(data, url);
		// TODO: support application/feed+json
		return doc.getElementsByTag("link").stream()
			.filter(t -> t.attr("type").equals("application/rss+xml") || t.attr("type").equals("application/atom+xml"))
			.filter(t -> t.hasAttr("href"))
			.map(t -> t.absUrl("href"))
			.findFirst().orElse(null);
	}

	/**
	 * Scrapes a web page into a Ref without persisting it.
	 *
	 * <p>The order of the steps below matters: several extraction steps remove the
	 * elements they consumed from the document, and parseText() runs last on whatever
	 * markup is left.
	 *
	 * @param url    the page to scrape
	 * @param origin the origin used for config lookup, Ref lookup and caching
	 * @return {@code null} when no scrape config applies; a Ref created by
	 *         {@code from(url, origin)} when the response is blank or does not start with
	 *         {@code <}; otherwise the existing (detached) or new Ref populated from the page
	 */
	@Timed(value = "jasper.scrape")
	public Ref web(String url, String origin) throws IOException, URISyntaxException {
		var config = getConfig(url, origin);
		if (config == null) return null;
		var data = proxy.fetchString(url, origin, refRepository.existsByUrlAndOrigin(url, origin));
		// Anything that is not markup cannot be scraped.
		if (isBlank(data) || !data.trim().startsWith("<")) return from(url, origin);
		var result = refRepository.findOneByUrlAndOrigin(url, origin).orElse(from(url, origin));
		// Update ref but don't persist changes
		em.detach(result);
		var doc = Jsoup.parse(data, url);
		// Start with the <title>; parseOpenGraph() may replace it with og:title later.
		result.setTitle(doc.title());
		fixImages(doc, config);
		parseImages(result, doc, config);
		parseThumbnails(result, doc, config);
		parsePublished(result, doc, config);
		removeSelectors(doc, config);
		removeStyleSelectors(doc, config);
		parseVideos(result, doc, config);
		// Drop <video> elements that have no <source> child.
		for (var v : doc.select("video")) {
			if (v.select("source").isEmpty()) v.remove();
		}
		parseAudio(result, doc, config);
		// Drop <audio> elements that have no <source> child.
		for (var a : doc.select("audio")) {
			if (a.select("source").isEmpty()) a.remove();
		}
		parseOpenGraph(result, doc, config);
		parseOembed(result, doc, config);
		parseLinkedData(result, doc, config);
		parseText(result, doc, config);
		return result;
	}

	/**
	 * Rewrites the {@code src} of every {@code <img>} so lazily-loaded images resolve.
	 *
	 * <p>{@code data-src} replaces {@code src} when present, and the first candidate of
	 * {@code data-srcset} replaces both. The width/density descriptor that follows a
	 * srcset candidate (e.g. {@code " 2x"}) is dropped so only the URL is stored.
	 * Finally the first matching entry of the config's image-fix regexes is stripped
	 * from the URL.
	 *
	 * @param doc    the document to modify in place
	 * @param config the scrape config supplying the optional image-fix regexes
	 */
	private void fixImages(Document doc, Scrape config) {
		for (var image : doc.select("img")) {
			var src = image.absUrl("src");
			var dataSrc = image.absUrl("data-src");
			if (isNotBlank(dataSrc)) {
				src = dataSrc;
				image.attr("src", src);
			}
			var dataSrcset = image.absUrl("data-srcset");
			if (isNotBlank(dataSrcset)) {
				// split(",") drops trailing empty strings, so a value of "," yields no candidates.
				var candidates = dataSrcset.split(",");
				var first = candidates.length == 0 ? "" : candidates[0].trim().split("\\s+")[0];
				if (isNotBlank(first)) {
					src = getImage(first);
					image.attr("src", src);
				}
			}
			if (config.getImageFixRegex() == null) continue;
			for (var query : config.getImageFixRegex()) {
				// NOTE(review): matches() requires the regex to cover the whole URL while
				// replaceAll() removes every match of it - confirm this is the intended
				// contract for the configured expressions.
				if (src.matches(query)) {
					image.attr("src", src.replaceAll(query, ""));
					break;
				}
			}
		}
	}

	/**
	 * Extracts the main image for each configured image selector.
	 *
	 * <p>Only the first element matched by a selector is used. An {@code <a>} contributes
	 * its {@code href}; any other element contributes the last candidate of
	 * {@code data-srcset}, else the last candidate of {@code srcset}, else {@code src},
	 * and its parent is then removed from the document. Elements that yield no URL are
	 * left untouched.
	 *
	 * @param result the Ref receiving the "plugin/image" and thumbnail URLs
	 * @param doc    the document to read from and prune
	 * @param config the scrape config supplying the image selectors
	 */
	private void parseImages(Ref result, Document doc, Scrape config) {
		if (config.getImageSelectors() == null) return;
		for (var s : config.getImageSelectors()) {
			var image = doc.select(s).first();
			if (image == null) continue;
			var isLink = image.tagName().equals("a");
			var src = "";
			if (isLink) {
				src = image.absUrl("href");
			} else if (image.hasAttr("data-srcset")) {
				src = lastSrcsetUrl(image.absUrl("data-srcset"));
			} else if (image.hasAttr("srcset")) {
				src = lastSrcsetUrl(image.absUrl("srcset"));
			} else if (image.hasAttr("src")) {
				src = image.absUrl("src");
			}
			// Never store an empty URL as the image or thumbnail.
			if (src.isBlank()) continue;
			cacheLater(src, result.getOrigin());
			addPluginUrl(result, "plugin/image", getImage(src));
			addThumbnailUrl(result, getThumbnail(src));
			if (!isLink) {
				var parent = image.parent();
				if (parent != null) parent.remove();
			}
		}
	}

	/**
	 * Extracts thumbnails for each configured thumbnail selector.
	 *
	 * <p>Every matched element is considered. An {@code <svg>} is sanitized and inlined
	 * as a data URL; other elements contribute {@code href}, else the last candidate of
	 * {@code data-srcset}, else the last candidate of {@code srcset}, else {@code src}.
	 * The parent of every matched element is removed from the document, whether or not
	 * a URL was found.
	 *
	 * @param result the Ref receiving the thumbnail URL
	 * @param doc    the document to read from and prune
	 * @param config the scrape config supplying the thumbnail selectors
	 */
	private void parseThumbnails(Ref result, Document doc, Scrape config) {
		if (config.getThumbnailSelectors() == null) return;
		for (var s : config.getThumbnailSelectors()) {
			for (var thumbnail : doc.select(s)) {
				if (thumbnail.tagName().equals("svg")) {
					addThumbnailUrl(result, svgToUrl(sanitizer.clean(thumbnail.outerHtml(), result.getUrl())));
				} else {
					var src = "";
					if (thumbnail.hasAttr("href")) {
						src = thumbnail.absUrl("href");
					} else if (thumbnail.hasAttr("data-srcset")) {
						src = lastSrcsetUrl(thumbnail.absUrl("data-srcset"));
					} else if (thumbnail.hasAttr("srcset")) {
						src = lastSrcsetUrl(thumbnail.absUrl("srcset"));
					} else if (thumbnail.hasAttr("src")) {
						src = thumbnail.absUrl("src");
					}
					// Never store an empty URL as the thumbnail.
					if (!src.isBlank()) {
						cacheLater(src, result.getOrigin());
						addThumbnailUrl(result, getThumbnail(src));
					}
				}
				var parent = thumbnail.parent();
				if (parent != null) parent.remove();
			}
		}
	}

	/**
	 * Returns the URL of the last candidate in a {@code srcset}-style attribute value.
	 *
	 * <p>Candidates are comma separated and usually followed by a space, so each one
	 * must be trimmed before its descriptor (e.g. {@code " 640w"}) is cut off;
	 * splitting the untrimmed text on a space would return the empty string that
	 * precedes the leading space.
	 *
	 * <p>NOTE(review): callers resolve the attribute as a single URL first, so a
	 * relative URL in a later candidate is presumably returned unresolved - confirm.
	 *
	 * @param srcset the attribute value, possibly empty
	 * @return the last candidate's URL, or an empty string when there is none
	 */
	private static String lastSrcsetUrl(String srcset) {
		var candidates = srcset.split(",");
		// split(",") drops trailing empty strings, so a value of "," yields no candidates.
		if (candidates.length == 0) return "";
		return candidates[candidates.length - 1].trim().split("\\s+")[0];
	}

	// Options handed to Chronic when parsing free-text dates; the pointer type is PAST.
	private static final Options opts = new Options(Pointer.PointerType.PAST);

	/**
	 * Sets the published date from the first configured selector that yields a usable date.
	 *
	 * <p>Only the first element matched by a selector is inspected. A {@code <time>}
	 * element is read from its {@code datetime} attribute via parseDate(); any other
	 * element has its text handed to Chronic. A selector whose value cannot be parsed is
	 * skipped, so the Ref's existing date is never replaced with {@code null}.
	 *
	 * @param result the Ref whose published date is updated
	 * @param doc    the document to read from
	 * @param config the scrape config supplying the published selectors
	 */
	private void parsePublished(Ref result, Document doc, Scrape config) {
		if (config.getPublishedSelectors() == null) return;
		for (var s : config.getPublishedSelectors()) {
			var published = doc.select(s).first();
			if (published == null) continue;
			Instant parsed;
			if (published.tagName().equals("time")) {
				var date = published.attr("datetime");
				if (isBlank(date)) continue;
				// parseDate() signals an unsupported format with null rather than an exception.
				parsed = parseDate(date);
			} else {
				// Chronic yields no span for text it cannot interpret.
				var span = Chronic.parse(published.text(), opts);
				parsed = span == null ? null : Instant.ofEpochSecond(span.getBegin());
			}
			if (parsed == null) continue;
			result.setPublished(parsed);
			return;
		}
	}

	/**
	 * Deletes every element matched by the config's remove selectors from the document.
	 * Does nothing when no remove selectors are configured.
	 */
	private void removeSelectors(Document doc, Scrape config) {
		var selectors = config.getRemoveSelectors();
		if (selectors == null) return;
		for (var selector : selectors) {
			doc.select(selector).remove();
		}
	}

	/**
	 * Strips the inline {@code style} attribute from every element matched by the
	 * config's remove-style selectors. Does nothing when none are configured.
	 */
	private void removeStyleSelectors(Document doc, Scrape config) {
		var selectors = config.getRemoveStyleSelectors();
		if (selectors == null) return;
		for (var selector : selectors) {
			doc.select(selector).removeAttr("style");
		}
	}

	/**
	 * Extracts video URLs from every element matched by the config's video selectors.
	 *
	 * <p>A {@code <div>} contributes its {@code data-stream} attribute and stays in the
	 * document. Any other element with a {@code src} attribute contributes that URL,
	 * offers it as a fallback thumbnail, and has its parent removed from the document.
	 *
	 * @param result the Ref receiving the video (and fallback thumbnail) URL
	 * @param doc    the document to read from and prune
	 * @param config the scrape config supplying the video selectors
	 */
	private void parseVideos(Ref result, Document doc, Scrape config) {
		var selectors = config.getVideoSelectors();
		if (selectors == null) return;
		for (var selector : selectors) {
			for (var video : doc.select(selector)) {
				if (video.tagName().equals("div")) {
					var stream = video.absUrl("data-stream");
					cacheLater(stream, result.getOrigin());
					addVideoUrl(result, getVideo(stream));
					continue;
				}
				if (!video.hasAttr("src")) continue;
				var source = video.absUrl("src");
				cacheLater(source, result.getOrigin());
				addVideoUrl(result, getVideo(source));
				addWeakThumbnail(result, getThumbnail(source));
				video.parent().remove();
			}
		}
	}

	/**
	 * Extracts audio URLs from every element matched by the config's audio selectors.
	 *
	 * <p>Only elements carrying a {@code src} attribute are used; each one sets the
	 * "plugin/audio" URL and has its parent removed from the document.
	 *
	 * @param result the Ref receiving the audio URL
	 * @param doc    the document to read from and prune
	 * @param config the scrape config supplying the audio selectors
	 */
	private void parseAudio(Ref result, Document doc, Scrape config) {
		var selectors = config.getAudioSelectors();
		if (selectors == null) return;
		for (var selector : selectors) {
			for (var audio : doc.select(selector)) {
				if (!audio.hasAttr("src")) continue;
				var source = audio.absUrl("src");
				cacheLater(source, result.getOrigin());
				// Audio sources go through the same URL hook as video sources.
				addPluginUrl(result, "plugin/audio", getVideo(source));
				audio.parent().remove();
			}
		}
	}

	/**
	 * Applies Open Graph {@code <meta>} tags to the Ref when the config enables them.
	 *
	 * <p>Handles og:audio, og:video, og:image and og:title, then the publication date
	 * from article:published_time, og:article:published_time and og:book:release_date.
	 * The date properties are applied in that order, so a later one overrides an earlier
	 * one. A date that cannot be parsed is ignored instead of clearing the current one.
	 *
	 * @param result the Ref to update
	 * @param doc    the document to read from
	 * @param config the scrape config; nothing happens unless {@code isOpenGraph()} is true
	 */
	private void parseOpenGraph(Ref result, Document doc, Scrape config) {
		if (!config.isOpenGraph()) return;
		for (var metaAudio : doc.select("meta[property=og:audio]")) {
			// Check the resolved URL, not the raw attribute, so an unresolvable value is skipped.
			var audioUrl = metaAudio.absUrl("content");
			if (isBlank(audioUrl)) continue;
			addPluginUrl(result, "plugin/audio", audioUrl);
		}
		for (var metaVideo : doc.select("meta[property=og:video]")) {
			var videoUrl = metaVideo.absUrl("content");
			if (isBlank(videoUrl)) continue;
			// TODO: video filetypes
			if (videoUrl.endsWith(".m3u8") || videoUrl.endsWith(".mp4")) {
				addVideoUrl(result, videoUrl);
			} else {
				addPluginUrl(result, "plugin/embed", videoUrl);
			}
		}
		for (var metaImage : doc.select("meta[property=og:image]")) {
			var imageUrl = metaImage.absUrl("content");
			if (isBlank(imageUrl)) continue;
			// TODO: In some cases load plugin/image
			addThumbnailUrl(result, imageUrl);
		}
		var title = firstMetaContent(doc, "og:title");
		if (isNotBlank(title)) result.setTitle(title);
		for (var property : List.of("article:published_time", "og:article:published_time", "og:book:release_date")) {
			var date = firstMetaContent(doc, property);
			if (isBlank(date)) continue;
			// parseDate() returns null for unsupported formats; keep the current date then.
			var published = parseDate(date);
			if (published != null) result.setPublished(published);
		}
	}

	/**
	 * Returns the {@code content} attribute of the first {@code <meta>} with the given
	 * {@code property}, or {@code null} when the document has no such element.
	 */
	private static String firstMetaContent(Document doc, String property) {
		var meta = doc.select("meta[property=" + property + "]").first();
		return meta == null ? null : meta.attr("content");
	}

	/**
	 * Placeholder for oEmbed discovery: when enabled in the config it locates the
	 * {@code application/json+oembed} link of the page, but nothing is done with it yet.
	 */
	private void parseOembed(Ref result, Document doc, Scrape config) {
		if (!config.isOembedJson()) return;
		var link = doc.select("link[type=application/json+oembed]").first();
		if (link == null) return;
		var oembedUrl = link.absUrl("href");
		// TODO: embedded oembed
	}

	/**
	 * Applies every {@code application/ld+json} script in the document to the Ref when
	 * the config enables LD+JSON.
	 *
	 * <p>A script may hold a single object or an array of objects. A script that cannot
	 * be processed is logged and skipped so the remaining scripts are still applied.
	 *
	 * @param result the Ref to update
	 * @param doc    the document to read from
	 * @param config the scrape config; nothing happens unless {@code isLdJson()} is true
	 */
	private void parseLinkedData(Ref result, Document doc, Scrape config) {
		if (!config.isLdJson()) return;
		for (var jsonld : doc.select("script[type=application/ld+json]")) {
			var json = jsonld.html().trim().replaceAll("\n", " ");
			try {
				// A leading '{' means a single object; anything else is read as an array.
				var entries = json.startsWith("{") ?
					List.of(objectMapper.readValue(json, JsonLd.class)) :
					objectMapper.readValue(json, new TypeReference<List<JsonLd>>(){});
				for (var entry : entries) parseLd(result, entry);
			} catch (Exception e) {
				logger.warn("Invalid LD+JSON. {}", getMessage(e));
				logger.debug(json);
			}
		}
	}

	/**
	 * Sets the Ref's comment from the page text when the config enables text extraction.
	 *
	 * <p>The first text selector that matches an element in the body wins: elements
	 * matched inside it by the remove-after selectors are deleted and its sanitized HTML
	 * becomes the comment. When no selector matches, the comment falls back to the plain
	 * text of the whole body with tabs removed and every line break doubled.
	 *
	 * @param result the Ref whose comment is set
	 * @param doc    the document to read from (matched content is pruned in place)
	 * @param config the scrape config supplying the text and remove-after selectors
	 */
	private void parseText(Ref result, Document doc, Scrape config) {
		if (!config.isText()) return;
		if (config.getTextSelectors() != null) {
			for (var s : config.getTextSelectors()) {
				var el = doc.body().select(s).first();
				if (el == null) continue;
				// The remove-after list is optional like every other selector list.
				if (config.getRemoveAfterSelectors() != null) {
					for (var r : config.getRemoveAfterSelectors()) el.select(r).remove();
				}
				result.setComment(sanitizer.clean(el.html(), result.getUrl()));
				return;
			}
		}
		result.setComment(doc.body()
			.wholeText()
			.trim()
			.replaceAll("\t", "")
			.replaceAll("[\n\r]", "\n\n"));
	}

	/**
	 * Adds a thumbnail only when the Ref is not already tagged "plugin/thumbnail", so a
	 * thumbnail found earlier is never replaced by this fallback.
	 */
	private void addWeakThumbnail(Ref ref, String url) {
		if (ref.hasTag("plugin/thumbnail")) return;
		addThumbnailUrl(ref, url);
	}

	/**
	 * Sets the Ref's "plugin/thumbnail" URL.
	 *
	 * <p>Missing or blank URLs are ignored, mirroring the guard in addVideoUrl(), and so
	 * are URLs ending in ".com" or ".m3u8".
	 *
	 * @param ref the Ref to update
	 * @param url the candidate thumbnail URL, may be {@code null} or blank
	 */
	private void addThumbnailUrl(Ref ref, String url) {
		if (url == null || url.isBlank()) return;
		if (url.endsWith(".com") || url.endsWith(".m3u8")) return;
		// TODO: Fallback if image can't load
		addPluginUrl(ref, "plugin/thumbnail", url);
	}

	/**
	 * Sets the Ref's "plugin/video" URL, ignoring blank URLs.
	 *
	 * <p>An ".m3u8" URL does not replace an existing video URL that is not ".m3u8".
	 */
	private void addVideoUrl(Ref ref, String url) {
		if (isBlank(url)) return;
		if (ref.hasTag("plugin/video") && url.endsWith(".m3u8")) {
			var current = ref.getPlugin("plugin/video", Video.class).getUrl();
			if (!current.endsWith(".m3u8")) return;
		}
		addPluginUrl(ref, "plugin/video", url);
	}

	/**
	 * Applies one LD+JSON object to the Ref, recursing into nested image and video objects.
	 *
	 * <p>Handles the thumbnailUrl of any object, the types NewsArticle (published date),
	 * AudioObject, VideoObject and ImageObject (media URLs), publisher logos (fallback
	 * thumbnail) and the image/video properties. A published date that cannot be parsed
	 * is ignored instead of clearing the current one.
	 *
	 * @param result the Ref to update
	 * @param ld     the LD+JSON object to apply
	 */
	private void parseLd(Ref result, JsonLd ld) {
		if (isNotBlank(ld.getThumbnailUrl())) addWeakThumbnail(result, ld.getThumbnailUrl());
		if ("NewsArticle".equals(ld.getType())) {
			if (isNotBlank(ld.getDatePublished())) {
				// parseDate() returns null for unsupported formats; keep the current date then.
				var published = parseDate(ld.getDatePublished());
				if (published != null) result.setPublished(published);
			}
		}
		if ("AudioObject".equals(ld.getType())) {
			if (isNotBlank(ld.getEmbedUrl())) {
				addPluginUrl(result, "plugin/embed", ld.getEmbedUrl());
			} else if (isNotBlank(ld.getUrl())) {
				addPluginUrl(result, "plugin/audio", ld.getUrl());
			}
		}
		if (("VideoObject".equals(ld.getType())) || "http://schema.org/VideoObject".equals(ld.getType())) {
			if (isNotBlank(ld.getContentUrl())) {
				addVideoUrl(result, ld.getContentUrl());
				if (isBlank(ld.getThumbnailUrl())) addThumbnailUrl(result, ld.getContentUrl());
			} else if (isNotBlank(ld.getEmbedUrl())) {
				addPluginUrl(result, "plugin/embed", ld.getEmbedUrl());
			}
		}
		// TODO: nothing is extracted from the "SocialMediaPosting" type yet.
		if ("ImageObject".equals(ld.getType())) {
			if (isNotBlank(ld.getEmbedUrl())) {
				addPluginUrl(result, "plugin/embed", ld.getEmbedUrl());
			} else if (isNotBlank(ld.getContentUrl())) {
				if (isBlank(ld.getThumbnailUrl())) {
					addThumbnailUrl(result, ld.getContentUrl());
				} else if (!hasMedia(result)) {
					addPluginUrl(result, "plugin/image", ld.getContentUrl());
				}
			}
		}
		if (ld.getPublisher() != null) {
			for (var p : ld.getPublisher()) {
				// A publisher given as plain text carries no logo.
				if (p.isTextual()) continue;
				var pub = objectMapper.convertValue(p, JsonLd.class);
				if (pub.getLogo() != null) {
					// The logo may be a single node or an array of nodes.
					for (var icon : pub.getLogo().isArray() ? pub.getLogo() : List.of(pub.getLogo())) {
						if (icon.isObject()) {
							if (icon.has("url") && isNotBlank(icon.get("url").asText())) addWeakThumbnail(result, icon.get("url").asText());
						} else if (isNotBlank(icon.asText())) {
							addWeakThumbnail(result, icon.asText());
						}
					}
				}
			}
		}
		if (ld.getImage() != null) {
			for (var image : ld.getImage()) {
				if (image.isTextual()) {
					addPluginUrl(result, "plugin/image", image.textValue());
				} else {
					parseLd(result, objectMapper.convertValue(image, JsonLd.class));
				}
			}
		}
		if (ld.getVideo() != null) {
			for (var video : ld.getVideo()) {
				if (video.isTextual()) {
					addPluginUrl(result, "plugin/video", video.textValue());
				} else {
					parseLd(result, objectMapper.convertValue(video, JsonLd.class));
				}
			}
		}
	}

	// Hook for rewriting a video (or audio) source URL before it is stored on the Ref.
	// Currently returns the URL unchanged.
	private String getVideo(String src) {
		return src;
	}

	/**
	 * Rewrites an image URL to the variant stored as the full-size image.
	 *
	 * <p>A "/full/max/0/" path segment is replaced by "/full/!1920,1080/0/" (presumably
	 * an IIIF-style size request - confirm). Otherwise everything from "?resize" onwards
	 * is cut off. Any other URL is returned unchanged.
	 */
	private String getImage(String src) {
		if (src.contains("/full/max/0/")) {
			return src.replace("/full/max/0/", "/full/!1920,1080/0/");
		}
		var resizeAt = src.indexOf("?resize");
		return resizeAt >= 0 ? src.substring(0, resizeAt) : src;
	}

	/**
	 * Rewrites an image URL to the variant stored as the thumbnail: a "/full/max/0/"
	 * path segment is replaced by "/full/!300,200/0/"; any other URL is returned unchanged.
	 */
	private String getThumbnail(String src) {
		var fullSize = "/full/max/0/";
		return src.contains(fullSize) ? src.replace(fullSize, "/full/!300,200/0/") : src;
	}

	/**
	 * Encodes SVG markup as a {@code data:image/svg+xml} URL.
	 *
	 * <p>An SVG namespace is added when the markup mentions no "xmlns", "viewbox" is
	 * restored to "viewBox", double quotes become single quotes, the characters
	 * {@code % # { } < >} are percent-encoded, and every run of whitespace (including
	 * line breaks and tabs) is collapsed to a single space.
	 *
	 * @param svg the SVG markup
	 * @return the data URL
	 */
	private String svgToUrl(String svg) {
		var svgTag = svg.contains("xmlns") ? "<svg" : "<svg xmlns='http://www.w3.org/2000/svg'";
		var encoded = svg
			.replace("<svg", svgTag)
			.replace("viewbox", "viewBox")
			.replace("\"", "'")
			// '%' must be encoded before the other characters introduce new '%' signs.
			.replace("%", "%25")
			.replace("#", "%23")
			.replace("{", "%7B")
			.replace("}", "%7D")
			.replace("<", "%3C")
			.replace(">", "%3E")
			// "\\s+" is the regex whitespace class; a single-backslash "\s" in a Java
			// string literal is just a space character and would leave line breaks in place.
			.replaceAll("\\s+", " ");
		return "data:image/svg+xml," + encoded;
	}

	/**
	 * Stores {@code {"url": url}} as the plugin data for the tag on the Ref. For every
	 * tag except "plugin/embed" the URL is first passed to cacheLater().
	 */
	private void addPluginUrl(Ref ref, String tag, String url) {
		var isEmbed = "plugin/embed".equals(tag);
		if (!isEmbed) {
			cacheLater(url, ref.getOrigin());
		}
		ref.setPlugin(tag, Map.of("url", url));
	}

	/**
	 * Tags a URL with "_plugin/delta/cache" unless it is blank or a Ref for it already
	 * carries "_plugin/cache" or "_plugin/delta/cache".
	 */
	private void cacheLater(String url, String origin) {
		if (isBlank(url)) return;
		var fixed = fixUrl(url);
		var alreadyTagged = refRepository.findOneByUrlAndOrigin(fixed, origin)
			.map(existing -> existing.hasTag("_plugin/cache") || existing.hasTag("_plugin/delta/cache"))
			.orElse(false);
		if (alreadyTagged) return;
		tagger.internalTag(fixed, origin, "_plugin/delta/cache");
	}

	/**
	 * Percent-encodes every space in a URL as "%20".
	 */
	private String fixUrl(String url) {
		// TODO: Add plugin to override like oembeds
		// (an alternative that was considered: turning "%20" into "+")
		return url.replace(" ", "%20");
	}

	/**
	 * Parses a date string leniently.
	 *
	 * <p>Accepted forms, tried in order after trimming: an ISO-8601 instant
	 * ({@code 2020-01-02T03:04:05Z}), an ISO-8601 date-time with an offset
	 * ({@code 2020-01-02T03:04:05+02:00}) and an ISO-8601 date ({@code 2020-01-02}),
	 * which is taken as the start of that day in UTC.
	 *
	 * @param date the text to parse, may be {@code null} or blank
	 * @return the parsed instant, or {@code null} when the text is missing or in an
	 *         unsupported format; this method never throws for bad input
	 */
	private Instant parseDate(String date) {
		if (date == null || date.isBlank()) return null;
		var text = date.trim();
		try {
			return Instant.parse(text);
		} catch (DateTimeException ignored) {
			// Not an instant; try the looser forms below.
		}
		try {
			return OffsetDateTime.parse(text).toInstant();
		} catch (DateTimeException ignored) {
			// No date-time with an offset either.
		}
		try {
			return LocalDate.parse(text).atStartOfDay(ZoneOffset.UTC).toInstant();
		} catch (DateTimeException ignored) {
			// TODO: support other date formats
			return null;
		}
	}

}