// FetchImplHttp.java

package jasper.component;

import io.github.resilience4j.bulkhead.annotation.Bulkhead;
import jasper.errors.NotFoundException;
import jasper.errors.ScrapeProtocolException;
import jasper.security.HostCheck;
import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Profile;
import org.springframework.stereotype.Component;

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;

import static jasper.domain.proj.HasTags.hasMatchingTag;
import static jasper.plugin.Pull.getPull;
import static org.apache.http.util.EntityUtils.consumeQuietly;

@Profile("proxy")
@Component
public class FetchImplHttp implements Fetch {
	private static final Logger logger = LoggerFactory.getLogger(FetchImplHttp.class);

	@Autowired
	HostCheck hostCheck;

	@Autowired
	ConfigCache configs;

	@Autowired
	HttpClientFactory httpClientFactory;

	@Autowired
	Replicator replicator;

	@Bulkhead(name = "fetch")
	public FileRequest doScrape(String url, String origin) throws IOException {
		var remote = configs.getRemote(origin);
		var pull = getPull(remote);
		if (url.startsWith("cache:") || pull.isCacheProxy()) {
			if (hasMatchingTag(remote, "+plugin/error")) return null;
			if (remote == null) {
				logger.warn("{} Can't find remote for cache {}", origin, url);
				return null;
			}
			return replicator.fetch(url, remote);
		}
		if (url.startsWith("http:") || url.startsWith("https:")) {
			return wrap(doWebScrape(url));
		}
		throw new ScrapeProtocolException(url.contains(":") ? url.substring(0, url.indexOf(":")) : "unknown");
	}

	private CloseableHttpResponse doWebScrape(String url) throws IOException {
		logger.debug("Starting request to {}", url);
		HttpUriRequest request = new HttpGet(url);
		if (!hostCheck.validHost(request.getURI())) {
			logger.info("Invalid host {}", request.getURI().getHost());
			throw new NotFoundException("Invalid host.");
		}
		request.setHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36");
		var http = httpClientFactory.getClient();
		var res = http.execute(request);
		if (res == null) return null;
		if (res.getStatusLine().getStatusCode() == 301 || res.getStatusLine().getStatusCode() == 304) {
			try {
				var location = res.getFirstHeader("Location").getElements()[0].getValue();
				logger.debug("Forwarding request to {} -> {}", url, location);
				return doWebScrape(location);
			} catch (Exception e) {
				logger.error("Error forwarding request from {}", url, e);
				return null;
			} finally {
				res.close();
			}
		}
		logger.debug("Request completed {}", url);
		return res;
	}

	private FileRequest wrap(CloseableHttpResponse res) {
		if (res == null) return null;
		return new FileRequest() {
			@Override
			public String getMimeType() {
				var header = res.getFirstHeader(HttpHeaders.CONTENT_TYPE);
				return header == null ? null : header.getValue();
			}

			@Override
			public InputStream getInputStream() throws IOException {
				return new FilterInputStream(res.getEntity().getContent()) {
					@Override
					public void close() throws IOException {
						try {
							super.close();
						} finally {
							res.close();
						}
					}
				};
			}

			@Override
			public void close() throws IOException {
				consumeQuietly(res.getEntity());
				res.close();
			}
		};
	}

}