From: Felix Dörre Date: Fri, 23 Dec 2016 10:46:53 +0000 (+0100) Subject: add: Implement use of Cisco Umbrella 1 Million domain list X-Git-Url: https://code.wpia.club/?p=gigi.git;a=commitdiff_plain;h=4adc67e31c14c5f1192eec188a78595e044d92f8 add: Implement use of Cisco Umbrella 1 Million domain list as source for high-financial-value-domains Information about the list is available here: http://s3-us-west-1.amazonaws.com/umbrella-static/index.html Blogpost about it: https://blog.opendns.com/2016/12/14/cisco-umbrella-1-million/ Change-Id: I5d8183f5dd09e3b033301cec59b3fa1e820f236c --- diff --git a/util/org/cacert/gigi/util/HighFinancialValueFetcher.java b/util/org/cacert/gigi/util/HighFinancialValueFetcher.java index 338b8ae2..abac278d 100644 --- a/util/org/cacert/gigi/util/HighFinancialValueFetcher.java +++ b/util/org/cacert/gigi/util/HighFinancialValueFetcher.java @@ -9,14 +9,36 @@ import java.net.URL; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -public class HighFinancialValueFetcher { +public abstract class HighFinancialValueFetcher { + + public final int max; + + private File f; + + private String base; + + public HighFinancialValueFetcher(File f, int max, String base) { + this.f = f; + this.max = max; + this.base = base; + } public static void main(String[] args) throws IOException { int max = 1000; if (args.length > 1) { max = Integer.parseInt(args[1]); } - try (PrintWriter fos = new PrintWriter(new File(args[0]), "UTF-8"); ZipInputStream zis = new ZipInputStream(new URL("https://s3.amazonaws.com/alexa-static/top-1m.csv.zip").openStream())) { + HighFinancialValueFetcher fetcher; + if (args.length > 2 && "--alexa".equals(args[2])) { + fetcher = new HighFinancialValueFetcherAlexa(new File(args[0]), max); + } else { + fetcher = new HighFinancialValueFetcherUmbrella(new File(args[0]), max); + } + fetcher.fetch(); + } + + public final void fetch() throws IOException { + try (PrintWriter fos = new PrintWriter(f, "UTF-8"); ZipInputStream zis = 
new ZipInputStream(new URL(base).openStream())) { ZipEntry ze; outer: while ((ze = zis.getNextEntry()) != null) { @@ -24,17 +46,23 @@ public class HighFinancialValueFetcher { BufferedReader br = new BufferedReader(new InputStreamReader(zis, "UTF-8")); String line; while ((line = br.readLine()) != null) { - String[] parts = line.split(","); - int i = Integer.parseInt(parts[0]); - if (i > max) { - zis.close(); + handle(line, fos); + if (entries == -1) { break outer; } - fos.println(parts[1]); - System.out.println(line); } } } } + private int entries; + + public void emit(PrintWriter fos, String value) { + fos.println(value); + if (entries == -1 || entries++ > max) { + entries = -1; + } + } + + public abstract void handle(String line, PrintWriter fos); } diff --git a/util/org/cacert/gigi/util/HighFinancialValueFetcherAlexa.java b/util/org/cacert/gigi/util/HighFinancialValueFetcherAlexa.java new file mode 100644 index 00000000..17c91825 --- /dev/null +++ b/util/org/cacert/gigi/util/HighFinancialValueFetcherAlexa.java @@ -0,0 +1,22 @@ +package org.cacert.gigi.util; + +import java.io.File; +import java.io.PrintWriter; + +public class HighFinancialValueFetcherAlexa extends HighFinancialValueFetcher { + + public HighFinancialValueFetcherAlexa(File f, int max) { + super(f, max, "https://s3.amazonaws.com/alexa-static/top-1m.csv.zip"); + } + + @Override + public void handle(String line, PrintWriter fos) { + String[] parts = line.split(","); + // Assert that the value before the "," is an integer + Integer.parseInt(parts[0]); + + emit(fos, parts[1]); + System.out.println(parts[1]); + } + +} diff --git a/util/org/cacert/gigi/util/HighFinancialValueFetcherUmbrella.java b/util/org/cacert/gigi/util/HighFinancialValueFetcherUmbrella.java new file mode 100644 index 00000000..ef96a0fe --- /dev/null +++ b/util/org/cacert/gigi/util/HighFinancialValueFetcherUmbrella.java @@ -0,0 +1,29 @@ +package org.cacert.gigi.util; + +import java.io.File; +import java.io.PrintWriter; +import 
java.util.HashSet; + +public class HighFinancialValueFetcherUmbrella extends HighFinancialValueFetcher { + + public HighFinancialValueFetcherUmbrella(File f, int max) { + super(f, max, "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip"); + } + + private HashSet<String> printed = new HashSet<>(); + + @Override + public void handle(String line, PrintWriter fos) { + String[] parts = line.split(","); + // Assert that the value before the "," is an integer + Integer.parseInt(parts[0]); + + String registrablePart = PublicSuffixes.getInstance().getRegistrablePart(parts[1]); + if (registrablePart != null && printed.add(registrablePart)) { + emit(fos, registrablePart); + System.out.println(registrablePart); + } + + } + +}