/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse.zip;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.net.URL;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.tika.Tika;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Walks the entries of a ZIP stream, hands every file entry that has a file
 * name extension to the Nutch parser selected by its detected content type,
 * and collects the parsed text and outlinks.
 */
public class ZipTextExtractor {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /** Chunk size used when copying the bytes of a single ZIP entry. */
    private static final int BUFFER_SIZE = 8192;

    private final Configuration conf;

    /**
     * Creates an extractor.
     *
     * @param conf configuration passed on to {@link Content} and {@link ParseUtil}
     */
    public ZipTextExtractor(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Parses every non-directory entry of the ZIP stream whose name contains a
     * {@code '.'} and concatenates the results.
     *
     * <p>For each parsed entry the string {@code name + " " + text + " "} is
     * appended to the result and the entry's outlinks are added to
     * {@code outLinksList}. Entries that fail with a {@link ParseException}
     * are logged and skipped.
     *
     * @param input        stream positioned at the start of ZIP data; it is
     *                     not closed by this method
     * @param url          URL of the archive; each entry is addressed as
     *                     {@code url + "/" + entryName}
     * @param outLinksList list that receives the outlinks of all parsed entries
     * @return the concatenated entry names and parsed texts, or an empty
     *         string if nothing was parsed
     * @throws IOException if the ZIP stream cannot be read or an entry URL or
     *                     outlink URL is malformed
     */
    public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
        StringBuilder resultText = new StringBuilder();
        // Deliberately not closed here: closing the wrapper would also close
        // the caller-owned input stream.
        ZipInputStream zin = new ZipInputStream(input);
        // Created lazily and reused for all entries instead of once per entry.
        Tika tika = null;
        ParseUtil parseUtil = null;
        ZipEntry entry;
        while ((entry = zin.getNextEntry()) != null) {
            if (entry.isDirectory()) {
                continue;
            }
            String fname = entry.getName();
            String newurl = url + "/" + fname;
            // Constructing the URL validates it (MalformedURLException is an IOException).
            String base = new URL(newurl).toString();
            if (fname.lastIndexOf('.') == -1) {
                // No extension: skip without reading; getNextEntry() moves
                // past the unread entry data.
                continue;
            }
            // Read to end-of-entry rather than trusting entry.getSize(), which
            // is -1 when the size is not known.
            // NOTE(review): the entry size is unbounded here; consider a cap
            // if archives come from untrusted sources.
            byte[] b = readCurrentEntry(zin);
            if (tika == null) {
                tika = new Tika();
            }
            String contentType = tika.detect(fname);
            try {
                Metadata metadata = new Metadata();
                metadata.set("Content-Length", Long.toString(b.length));
                metadata.set("Content-Type", contentType);
                Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
                if (parseUtil == null) {
                    parseUtil = new ParseUtil(this.conf);
                }
                Parse parse = parseUtil.parse(content).get(content.getUrl());
                if (parse == null) {
                    LOG.warn("no parse result for {}", fname);
                    continue;
                }
                ParseData theParseData = parse.getData();
                for (Outlink outlink : theParseData.getOutlinks()) {
                    outLinksList.add(new Outlink(outlink.getToUrl(), outlink.getAnchor()));
                }
                resultText.append(fname).append(' ').append(parse.getText()).append(' ');
            }
            catch (ParseException e) {
                LOG.info("fetch okay, but can't parse {}, reason: {}", fname, e.getMessage());
            }
        }
        return resultText.toString();
    }

    /** Reads the remaining bytes of the stream's current ZIP entry. */
    private static byte[] readCurrentEntry(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream(BUFFER_SIZE);
        byte[] buffer = new byte[BUFFER_SIZE];
        int read;
        while ((read = in.read(buffer)) != -1) {
            out.write(buffer, 0, read);
        }
        return out.toByteArray();
    }
}

