// HtmlImporterResource.java 12 KB
/*
 * Decompiled with CFR 0_118.
 * 
 * Could not load the following classes:
 *  org.apache.commons.lang.StringUtils
 */
package com.day.cq.wcm.siteimporter.internal.resource;

import com.day.cq.wcm.siteimporter.ImporterContext;
import com.day.cq.wcm.siteimporter.internal.resource.CssImporterResource;
import com.day.cq.wcm.siteimporter.internal.resource.ImporterResource;
import com.day.cq.wcm.siteimporter.internal.resource.ImporterResourceVisitor;
import com.day.cq.wcm.siteimporter.internal.resource.ReferenceLocation;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;

public class HtmlImporterResource
extends ImporterResource {
    private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile("(?:<meta.*\\s*http-equiv\\s*=\\s*(?:\"|')Content-Type(?:\"|').*\\s*content=(?:\"|'))([a-zA-z_\\-/;\\s=]*)(\"|'.*>)", 2);
    protected ArrayList<ExtractableTag> tags = new ArrayList();
    private String basePath = null;
    private String unencodedContent;

    public HtmlImporterResource(URL location, ImporterContext ctx) {
        super(location, ctx);
        this.tags.add(new ExtractableTag("img", "src", 3));
        this.tags.add(new ExtractableTag("img", "url", 3));
        this.tags.add(new ExtractableTag("link", "href", 1));
        this.tags.add(new ExtractableTag("script", "src", 2));
        this.tags.add(new ExtractableTag("a", "href", 0));
        this.tags.add(new ExtractableTag("embed", "src", 3));
        this.tags.add(new ExtractableTag("param", "value", 3, "movie"));
        this.tags.add(new ExtractableTag("input", "src", 3, "image"));
    }

    @Override
    public Object triggerAction(ImporterResourceVisitor is) {
        return is.performHtml(this);
    }

    @Override
    public ArrayList<ReferenceLocation> getExternalReferences() {
        ArrayList<ReferenceLocation> result = new ArrayList<ReferenceLocation>();
        result.addAll(this.getHtmlReferences());
        result.addAll(this.getCssReferences());
        return result;
    }

    protected ArrayList<ReferenceLocation> getHtmlReferences() {
        ArrayList<ReferenceLocation> results = new ArrayList<ReferenceLocation>();
        Iterator<ExtractableTag> tagIt = this.tags.iterator();
        String content = "";
        try {
            content = this.getContent();
        }
        catch (IOException e) {
            this.ctx.error("Error while extracting HTML references from " + this.location.toExternalForm(), e);
        }
        while (tagIt.hasNext()) {
            ExtractableTag tag = tagIt.next();
            results.addAll(tag.extractAll(content));
        }
        return results;
    }

    protected ArrayList<ReferenceLocation> getCssReferences() {
        ArrayList<ReferenceLocation> result = new ArrayList<ReferenceLocation>();
        String content = "";
        try {
            content = this.getContent();
        }
        catch (IOException e) {
            this.ctx.error("Error while extracting CSS references from " + this.location.toExternalForm(), e);
        }
        result.addAll(this.getStyleTags(content));
        result.addAll(this.getStyleAttributes(content));
        return result;
    }

    protected ArrayList<ReferenceLocation> getStyleTags(String content) {
        int currentPos = 0;
        ArrayList<ReferenceLocation> result = new ArrayList<ReferenceLocation>();
        while (content.toLowerCase().indexOf("<style", currentPos) > -1) {
            String css;
            int start = content.toLowerCase().indexOf("<style", currentPos);
            int close = content.indexOf(">", start);
            int end = content.toLowerCase().indexOf("</style>", start);
            if (end > close && close != -1 && !"".equals(css = content.substring(close, end))) {
                CssImporterResource imp = new CssImporterResource(this.location, this.getContext());
                imp.setContent(css);
                ArrayList<ReferenceLocation> locals = imp.getExternalReferences();
                for (ReferenceLocation ref : locals) {
                    ref.setStart(ref.getStart() + close);
                    ref.setEnd(ref.getEnd() + close);
                }
                result.addAll(locals);
            }
            currentPos = Math.max(Math.max(currentPos, start), Math.max(close, end)) + 1;
        }
        return result;
    }

    @Override
    public String getBasePath() {
        if (this.basePath == null) {
            this.basePath = this.location.getPath();
            ExtractableTag base = new ExtractableTag("base", "href", 0);
            try {
                String content = this.getContent();
                ArrayList<ReferenceLocation> list = base.extractAll(content);
                if (list.size() > 0) {
                    this.basePath = list.get(list.size() - 1).getLink().getPath();
                }
            }
            catch (IOException e) {
                // empty catch block
            }
        }
        return this.basePath;
    }

    protected ArrayList<ReferenceLocation> getStyleAttributes(String content) {
        ArrayList<ReferenceLocation> result = new ArrayList<ReferenceLocation>();
        int currPos = content.toLowerCase().indexOf("style");
        while (currPos != -1) {
            if (content.lastIndexOf("<", currPos += 5) > content.lastIndexOf(">", currPos) && content.indexOf(">", currPos) > -1) {
                String shortContent = content.substring(currPos, content.indexOf(">", currPos));
                CssImporterResource imp = new CssImporterResource(this.location, this.getContext());
                imp.setContent(shortContent);
                ArrayList<ReferenceLocation> locals = imp.getExternalReferences();
                for (ReferenceLocation ref : locals) {
                    ref.setStart(ref.getStart() + currPos);
                    ref.setEnd(ref.getEnd() + currPos);
                }
                result.addAll(locals);
            }
            currPos = content.toLowerCase().indexOf("style", currPos);
        }
        return result;
    }

    @Override
    String encodeContent(ByteArrayOutputStream out) throws IOException {
        this.content = new String(out.toByteArray());
        String charset = this.getCharset();
        if (charset != null) {
            this.content = new String(out.toByteArray(), charset);
        }
        return this.content;
    }

    private String getCharset(String content) {
        return null;
    }

    @Override
    String getContentType() throws IOException {
        if (this.contentType == null && StringUtils.isNotBlank((String)this.content)) {
            Matcher matcher = CONTENT_TYPE_PATTERN.matcher(this.content);
            if (matcher.find() && matcher.groupCount() > 1) {
                this.contentType = matcher.group(1);
            }
            this.unencodedContent = null;
        }
        return null != this.contentType ? this.contentType : super.getContentType();
    }

    private class ExtractableTag {
        private String tagName;
        private String linkProperty;
        private int type;
        private String constraint;

        public ExtractableTag(String tagName, String linkProperty, int type) {
            this(tagName, linkProperty, type, "");
        }

        public ExtractableTag(String tagName, String linkProperty, int type, String constraint) {
            this.tagName = tagName;
            this.linkProperty = linkProperty;
            this.type = type;
            this.constraint = constraint;
        }

        public ArrayList<ReferenceLocation> extractAll(String html) {
            ArrayList<ReferenceLocation> locs = new ArrayList<ReferenceLocation>();
            int currentPos = 0;
            while (this.nextTagPosition(html, currentPos) > currentPos) {
                ReferenceLocation next = this.extract(html, currentPos);
                if (next != null) {
                    locs.add(next);
                    currentPos = next.getEnd();
                    continue;
                }
                if ((currentPos = this.nextTagPosition(html, currentPos) + 1) != 0) continue;
                currentPos = html.toLowerCase().lastIndexOf("<" + this.tagName) + 1;
            }
            return locs;
        }

        private int nextTagPosition(String source, int position) {
            if ("*".equals(this.tagName)) {
                int nextTag = source.indexOf("<", position);
                return source.indexOf(" ", nextTag);
            }
            return source.toLowerCase().indexOf("<" + this.tagName + " ", position);
        }

        private ReferenceLocation extract(String html, int position) {
            int start;
            int next = this.nextTagPosition(html, position);
            if (next == -1) {
                return null;
            }
            int terminal = html.indexOf(">", next);
            if (!("".equals(this.constraint) || html.toLowerCase().indexOf(this.constraint, next) != -1 && html.toLowerCase().indexOf(this.constraint, next) <= terminal)) {
                return null;
            }
            next = html.toLowerCase().indexOf(this.linkProperty, next);
            if (next > terminal || next == -1) {
                return null;
            }
            int end = start = next + this.linkProperty.length();
            int pos = start;
            String link = "";
            int state = 0;
            while (state != 5) {
                char nextChar = html.charAt(pos);
                switch (state) {
                    case 0: {
                        if (nextChar != '=') break;
                        state = 1;
                        break;
                    }
                    case 1: {
                        if (nextChar == '\"') {
                            state = 3;
                            start = pos + 1;
                            break;
                        }
                        if (nextChar == '\'') {
                            state = 4;
                            start = pos + 1;
                            break;
                        }
                        if (Character.isWhitespace(nextChar)) break;
                        link = link + nextChar;
                        state = 2;
                        start = pos;
                        break;
                    }
                    case 2: {
                        if (Character.isWhitespace(nextChar) || nextChar == '>') {
                            end = pos;
                            state = 5;
                            break;
                        }
                        link = link + nextChar;
                        break;
                    }
                    case 3: {
                        if (nextChar == '\"') {
                            end = pos;
                            state = 5;
                            break;
                        }
                        link = link + nextChar;
                        break;
                    }
                    case 4: {
                        if (nextChar == '\'') {
                            end = pos;
                            state = 5;
                            break;
                        }
                        link = link + nextChar;
                    }
                }
                if (++pos <= html.length()) continue;
                return null;
            }
            try {
                URL url = HtmlImporterResource.this.toAbsoluteReference(link);
                if (url == null) {
                    return null;
                }
                return new ReferenceLocation(start, end, url, this.getType(link));
            }
            catch (MalformedURLException e) {
                return null;
            }
        }

        protected int getType(String link) {
            if ("link".equals(this.tagName)) {
                if (link.endsWith("css")) {
                    return 1;
                }
                return 3;
            }
            return this.type;
        }
    }

}