package org.apache.any23.extractor.rdf;

import com.fasterxml.jackson.core.JsonProcessingException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.html.JsoupUtils;
import org.apache.any23.extractor.rdfa.RDFa11Parser;
import org.apache.any23.writer.JSONLDWriterFactory;
import org.eclipse.rdf4j.common.net.ParsedIRI;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFParser;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeTraversor;

/* loaded from: input_file:org/apache/any23/extractor/rdf/BaseRDFExtractor.class */
public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
    private boolean verifyDataType;
    private boolean stopAtFirstError;
    private static final Pattern invalidXMLCharacters = Pattern.compile("[^\t\r\n -\ud7ff\ue000-���-��]");

    public BaseRDFExtractor() {
        this(false, false);
    }

    public BaseRDFExtractor(boolean z, boolean z2) {
        this.verifyDataType = z;
        this.stopAtFirstError = z2;
    }

    protected abstract RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult);

    public boolean isVerifyDataType() {
        return this.verifyDataType;
    }

    public void setVerifyDataType(boolean z) {
        this.verifyDataType = z;
    }

    public boolean isStopAtFirstError() {
        return this.stopAtFirstError;
    }

    public void setStopAtFirstError(boolean z) {
        this.stopAtFirstError = z;
    }

    public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream inputStream, ExtractionResult extractionResult) throws IOException, ExtractionException {
        try {
            RDFParser parser = getParser(extractionContext, extractionResult);
            RDFFormat rDFFormat = parser.getRDFFormat();
            final String stringValue = extractionContext.getDocumentIRI().stringValue();
            if (rDFFormat.hasFileExtension("xhtml") || rDFFormat.hasMIMEType("application/xhtml+xml")) {
                Charset charset = rDFFormat.getCharset();
                if (charset == null) {
                    charset = StandardCharsets.UTF_8;
                }
                Document parse = JsoupUtils.parse(inputStream, stringValue, null);
                parse.outputSettings().prettyPrint(false).syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml).charset(charset);
                NodeTraversor.filter(new NodeFilter() { // from class: org.apache.any23.extractor.rdf.BaseRDFExtractor.1
                    final HashSet<String> tmpAttributeKeys = new HashSet<>();

                    public NodeFilter.FilterResult head(Node node, int i) {
                        String parsedIRI;
                        if (!(node instanceof Element)) {
                            return ((node instanceof DataNode) || (node instanceof Comment) || (node instanceof DocumentType)) ? NodeFilter.FilterResult.REMOVE : NodeFilter.FilterResult.CONTINUE;
                        }
                        HashSet<String> hashSet = this.tmpAttributeKeys;
                        Iterator it = node.attributes().iterator();
                        while (it.hasNext()) {
                            Attribute attribute = (Attribute) it.next();
                            String key = attribute.getKey();
                            String replaceAll = key.replaceAll("[^-a-zA-Z0-9_:.]", "");
                            int lastIndexOf = replaceAll.lastIndexOf(58) + 1;
                            String lowerCase = replaceAll.substring(0, lastIndexOf).toLowerCase();
                            String str = (lowerCase.startsWith("xml") ? lowerCase : "") + replaceAll.substring(lastIndexOf);
                            if (!str.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") || str.startsWith("xmlns:xml") || !hashSet.add(str)) {
                                it.remove();
                            } else if (!str.equals(key)) {
                                attribute.setKey(str);
                            }
                        }
                        hashSet.clear();
                        String replaceAll2 = ((Element) node).tagName().replaceAll("[^-a-zA-Z0-9_:.]", "");
                        String substring = replaceAll2.substring(replaceAll2.lastIndexOf(58) + 1);
                        ((Element) node).tagName(substring.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? substring : "div");
                        if ("base".equalsIgnoreCase(substring) && node.hasAttr(RDFa11Parser.HREF_ATTRIBUTE)) {
                            String attr = node.attr(RDFa11Parser.HREF_ATTRIBUTE);
                            try {
                                ParsedIRI create = ParsedIRI.create(attr.trim());
                                if (create.isAbsolute()) {
                                    parsedIRI = create.toString();
                                } else {
                                    ParsedIRI resolve = ParsedIRI.create(stringValue.trim()).resolve(create);
                                    if (resolve.isAbsolute()) {
                                        parsedIRI = resolve.toString();
                                    }
                                }
                                if (!parsedIRI.equals(attr)) {
                                    node.attr(RDFa11Parser.HREF_ATTRIBUTE, parsedIRI);
                                }
                            } catch (RuntimeException e) {
                            }
                        }
                        return NodeFilter.FilterResult.CONTINUE;
                    }

                    public NodeFilter.FilterResult tail(Node node, int i) {
                        return NodeFilter.FilterResult.CONTINUE;
                    }
                }, parse);
                inputStream = new ByteArrayInputStream(invalidXMLCharacters.matcher(parse.toString()).replaceAll("").getBytes(charset));
            } else if (rDFFormat.hasFileExtension(JSONLDWriterFactory.IDENTIFIER) || rDFFormat.hasMIMEType("application/ld+json")) {
                inputStream = new JsonCleaningInputStream(inputStream);
            }
            parser.parse(inputStream, stringValue);
        } catch (Exception e) {
            JsonProcessingException cause = e.getCause();
            if (!(cause instanceof JsonProcessingException)) {
                extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, toString(e), -1L, -1L);
                return;
            }
            JsonProcessingException jsonProcessingException = cause;
            if (jsonProcessingException.getLocation() == null) {
                extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, jsonProcessingException.getOriginalMessage(), -1L, -1L);
            } else {
                extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, jsonProcessingException.getOriginalMessage(), r0.getLineNr(), r0.getColumnNr());
            }
        }
    }

    private static String toString(Throwable th) {
        StringWriter stringWriter = new StringWriter();
        PrintWriter printWriter = new PrintWriter(stringWriter);
        Throwable th2 = null;
        try {
            th.printStackTrace(printWriter);
            if (printWriter != null) {
                if (0 != 0) {
                    try {
                        printWriter.close();
                    } catch (Throwable th3) {
                        th2.addSuppressed(th3);
                    }
                } else {
                    printWriter.close();
                }
            }
            String stringWriter2 = stringWriter.toString();
            return stringWriter2.length() > 1024 ? stringWriter2.substring(0, 1021) + "..." : stringWriter2;
        } catch (Throwable th4) {
            if (printWriter != null) {
                if (0 != 0) {
                    try {
                        printWriter.close();
                    } catch (Throwable th5) {
                        th2.addSuppressed(th5);
                    }
                } else {
                    printWriter.close();
                }
            }
            throw th4;
        }
    }
}
