WPIA git - gigi.git/commitdiff
add: Implement use of Cisco Umbrella 1 Million domain list
author: Felix Dörre <felix@dogcraft.de>
Fri, 23 Dec 2016 10:46:53 +0000 (11:46 +0100)
committer: Felix Dörre <felix@dogcraft.de>
Tue, 3 Jan 2017 11:03:59 +0000 (12:03 +0100)
as source for high-financial-value-domains
Information about the list is available here:
http://s3-us-west-1.amazonaws.com/umbrella-static/index.html

Blogpost about it:
https://blog.opendns.com/2016/12/14/cisco-umbrella-1-million/

Change-Id: I5d8183f5dd09e3b033301cec59b3fa1e820f236c

util/org/cacert/gigi/util/HighFinancialValueFetcher.java
util/org/cacert/gigi/util/HighFinancialValueFetcherAlexa.java [new file with mode: 0644]
util/org/cacert/gigi/util/HighFinancialValueFetcherUmbrella.java [new file with mode: 0644]

index 338b8ae2e324c68285c7c81dcc74dafeba221295..abac278dbd9ea05b5fd54e7cea067bb0696d18d5 100644 (file)
@@ -9,14 +9,36 @@ import java.net.URL;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
-public class HighFinancialValueFetcher {
+public abstract class HighFinancialValueFetcher {
+
+    public final int max;
+
+    private File f;
+
+    private String base;
+
+    public HighFinancialValueFetcher(File f, int max, String base) {
+        this.f = f;
+        this.max = max;
+        this.base = base;
+    }
 
     public static void main(String[] args) throws IOException {
         int max = 1000;
         if (args.length > 1) {
             max = Integer.parseInt(args[1]);
         }
-        try (PrintWriter fos = new PrintWriter(new File(args[0]), "UTF-8"); ZipInputStream zis = new ZipInputStream(new URL("https://s3.amazonaws.com/alexa-static/top-1m.csv.zip").openStream())) {
+        HighFinancialValueFetcher fetcher;
+        if (args.length > 2 && "--alexa".equals(args[2])) {
+            fetcher = new HighFinancialValueFetcherAlexa(new File(args[0]), max);
+        } else {
+            fetcher = new HighFinancialValueFetcherUmbrella(new File(args[0]), max);
+        }
+        fetcher.fetch();
+    }
+
+    public final void fetch() throws IOException {
+        try (PrintWriter fos = new PrintWriter(f, "UTF-8"); ZipInputStream zis = new ZipInputStream(new URL(base).openStream())) {
             ZipEntry ze;
             outer:
             while ((ze = zis.getNextEntry()) != null) {
@@ -24,17 +46,23 @@ public class HighFinancialValueFetcher {
                 BufferedReader br = new BufferedReader(new InputStreamReader(zis, "UTF-8"));
                 String line;
                 while ((line = br.readLine()) != null) {
-                    String[] parts = line.split(",");
-                    int i = Integer.parseInt(parts[0]);
-                    if (i > max) {
-                        zis.close();
+                    handle(line, fos);
+                    if (entries == -1) {
                         break outer;
                     }
-                    fos.println(parts[1]);
-                    System.out.println(line);
                 }
             }
         }
     }
 
+    private int entries;
+
+    public void emit(PrintWriter fos, String value) {
+        fos.println(value);
+        if (entries == -1 || entries++ > max) {
+            entries = -1;
+        }
+    }
+
+    public abstract void handle(String line, PrintWriter fos);
 }
diff --git a/util/org/cacert/gigi/util/HighFinancialValueFetcherAlexa.java b/util/org/cacert/gigi/util/HighFinancialValueFetcherAlexa.java
new file mode 100644 (file)
index 0000000..17c9182
--- /dev/null
@@ -0,0 +1,22 @@
+package org.cacert.gigi.util;
+
+import java.io.File;
+import java.io.PrintWriter;
+
+public class HighFinancialValueFetcherAlexa extends HighFinancialValueFetcher {
+
+    public HighFinancialValueFetcherAlexa(File f, int max) {
+        super(f, max, "https://s3.amazonaws.com/alexa-static/top-1m.csv.zip");
+    }
+
+    @Override
+    public void handle(String line, PrintWriter fos) {
+        String[] parts = line.split(",");
+        // Assert that the value before the "," is an integer
+        Integer.parseInt(parts[0]);
+
+        emit(fos, parts[1]);
+        System.out.println(parts[1]);
+    }
+
+}
diff --git a/util/org/cacert/gigi/util/HighFinancialValueFetcherUmbrella.java b/util/org/cacert/gigi/util/HighFinancialValueFetcherUmbrella.java
new file mode 100644 (file)
index 0000000..ef96a0f
--- /dev/null
@@ -0,0 +1,29 @@
+package org.cacert.gigi.util;
+
+import java.io.File;
+import java.io.PrintWriter;
+import java.util.HashSet;
+
+public class HighFinancialValueFetcherUmbrella extends HighFinancialValueFetcher {
+
+    public HighFinancialValueFetcherUmbrella(File f, int max) {
+        super(f, max, "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip");
+    }
+
+    private HashSet<String> printed = new HashSet<>();
+
+    @Override
+    public void handle(String line, PrintWriter fos) {
+        String[] parts = line.split(",");
+        // Assert that the value before the "," is an integer
+        Integer.parseInt(parts[0]);
+
+        String registrablePart = PublicSuffixes.getInstance().getRegistrablePart(parts[1]);
+        if (registrablePart != null && printed.add(registrablePart)) {
+            emit(fos, registrablePart);
+            System.out.println(registrablePart);
+        }
+
+    }
+
+}