import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
-public class HighFinancialValueFetcher {
+/**
+ * Base class for command-line fetchers that download a zip archive from a
+ * URL, read every entry of it line by line as UTF-8 text and let a subclass
+ * decide, via {@link #handle(String, PrintWriter)}, which values are written
+ * to the output file. At most {@link #max} values are written per
+ * {@link #fetch()} run.
+ */
+public abstract class HighFinancialValueFetcher {
+
+ /** Upper bound on the number of values written by one {@link #fetch()} run. */
+ public final int max;
+
+ /** File the emitted values are written to (UTF-8, one value per line). */
+ private final File f;
+
+ /** URL of the zip archive that is downloaded by {@link #fetch()}. */
+ private final String base;
+
+ /**
+ * @param f
+ * file that receives the emitted values
+ * @param max
+ * maximum number of values to emit; values {@code <= 0} emit nothing
+ * @param base
+ * URL of the zip archive to read
+ */
+ public HighFinancialValueFetcher(File f, int max, String base) {
+ this.f = f;
+ this.max = max;
+ this.base = base;
+ }
+
+ /**
+ * Command-line entry point. {@code args[0]} is the output file,
+ * {@code args[1]} (optional) overrides the default limit of 1000 and
+ * {@code args[2]} selects the Alexa fetcher when it equals
+ * {@code "--alexa"}; otherwise the Umbrella fetcher is used.
+ */
public static void main(String[] args) throws IOException {
int max = 1000;
if (args.length > 1) {
max = Integer.parseInt(args[1]);
}
- try (PrintWriter fos = new PrintWriter(new File(args[0]), "UTF-8"); ZipInputStream zis = new ZipInputStream(new URL("https://s3.amazonaws.com/alexa-static/top-1m.csv.zip").openStream())) {
+ HighFinancialValueFetcher fetcher;
+ if (args.length > 2 && "--alexa".equals(args[2])) {
+ fetcher = new HighFinancialValueFetcherAlexa(new File(args[0]), max);
+ } else {
+ fetcher = new HighFinancialValueFetcherUmbrella(new File(args[0]), max);
+ }
+ fetcher.fetch();
+ }
+
+ /**
+ * Downloads the archive, feeds every line of every zip entry to
+ * {@link #handle(String, PrintWriter)} and stops as soon as the emit limit
+ * has been reached.
+ *
+ * @throws IOException
+ * if the output file cannot be opened or the archive cannot be read
+ */
+ public final void fetch() throws IOException {
+ // Start every run with a fresh counter so that an instance whose
+ // previous run hit the limit does not stop after the first line.
+ entries = 0;
+ try (PrintWriter fos = new PrintWriter(f, "UTF-8"); ZipInputStream zis = new ZipInputStream(new URL(base).openStream())) {
ZipEntry ze;
outer:
while ((ze = zis.getNextEntry()) != null) {
BufferedReader br = new BufferedReader(new InputStreamReader(zis, "UTF-8"));
String line;
while ((line = br.readLine()) != null) {
- String[] parts = line.split(",");
- int i = Integer.parseInt(parts[0]);
- if (i > max) {
- zis.close();
+ handle(line, fos);
+ // emit() marks the limit as reached by setting entries to -1.
+ if (entries == -1) {
break outer;
}
- fos.println(parts[1]);
- System.out.println(line);
}
}
}
}
+ /**
+ * Number of values emitted in the current run, or {@code -1} once the
+ * limit given by {@link #max} has been reached.
+ */
+ private int entries;
+
+ /**
+ * Writes {@code value} as one line to {@code fos} unless the limit has
+ * already been reached, and marks the limit as reached once exactly
+ * {@link #max} values have been written.
+ *
+ * @param fos
+ * writer for the output file
+ * @param value
+ * the value to write
+ */
+ public void emit(PrintWriter fos, String value) {
+ // Check before printing so that never more than max values are written
+ // (and none at all for max <= 0).
+ if (entries == -1 || entries >= max) {
+ entries = -1;
+ return;
+ }
+ fos.println(value);
+ // Pre-increment: the value just written counts towards the limit.
+ if (++entries >= max) {
+ entries = -1;
+ }
+ }
+
+ /**
+ * Processes one line of the downloaded list. Implementations call
+ * {@link #emit(PrintWriter, String)} for every value that should end up
+ * in the output file.
+ *
+ * @param line
+ * one line read from a zip entry
+ * @param fos
+ * writer for the output file, to be passed on to {@code emit}
+ */
+ public abstract void handle(String line, PrintWriter fos);
}
--- /dev/null
+package org.cacert.gigi.util;
+
+import java.io.File;
+import java.io.PrintWriter;
+
+public class HighFinancialValueFetcherAlexa extends HighFinancialValueFetcher {
+
+ /** Location of the zip archive this fetcher reads. */
+ private static final String LIST_URL = "https://s3.amazonaws.com/alexa-static/top-1m.csv.zip";
+
+ /**
+ * @param f
+ * file that receives the emitted values
+ * @param max
+ * limit passed on to the superclass
+ */
+ public HighFinancialValueFetcherAlexa(File f, int max) {
+ super(f, max, LIST_URL);
+ }
+
+ /**
+ * Splits the line at "," and emits the second column, echoing it to
+ * standard output as well.
+ *
+ * @throws NumberFormatException
+ * if the first column is not an integer
+ */
+ @Override
+ public void handle(String line, PrintWriter fos) {
+ final String[] columns = line.split(",");
+ // Parsed purely as a sanity check; the numeric result is not needed.
+ Integer.parseInt(columns[0]);
+
+ final String value = columns[1];
+ emit(fos, value);
+ System.out.println(value);
+ }
+
+}
--- /dev/null
+package org.cacert.gigi.util;
+
+import java.io.File;
+import java.io.PrintWriter;
+import java.util.HashSet;
+
+public class HighFinancialValueFetcherUmbrella extends HighFinancialValueFetcher {
+
+ /** Registrable parts that this instance has already passed to emit. */
+ private HashSet<String> printed = new HashSet<>();
+
+ /**
+ * @param f
+ * file that receives the emitted values
+ * @param max
+ * limit passed on to the superclass
+ */
+ public HighFinancialValueFetcherUmbrella(File f, int max) {
+ super(f, max, "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip");
+ }
+
+ /**
+ * Splits the line at ",", maps the second column to its registrable part
+ * via {@code PublicSuffixes} and emits that part (also echoing it to
+ * standard output) the first time it is seen. Lines whose registrable
+ * part is {@code null} or was seen before are skipped.
+ *
+ * @throws NumberFormatException
+ * if the first column is not an integer
+ */
+ @Override
+ public void handle(String line, PrintWriter fos) {
+ final String[] columns = line.split(",");
+ // Parsed purely as a sanity check; the numeric result is not needed.
+ Integer.parseInt(columns[0]);
+
+ final String registrable = PublicSuffixes.getInstance().getRegistrablePart(columns[1]);
+ // Set.add returns false for duplicates, so each part passes only once.
+ if (registrable == null || !printed.add(registrable)) {
+ return;
+ }
+ emit(fos, registrable);
+ System.out.println(registrable);
+ }
+
+}