Added more TLD parsing support

This commit is contained in:
Robinson 2023-06-23 21:06:39 +02:00
parent 1d4f89c78e
commit 60993ddb81
No known key found for this signature in database
GPG Key ID: 8E7DB78588BD6F5C
1 changed files with 173 additions and 40 deletions

View File

@ -1,5 +1,5 @@
/* /*
* Copyright 2020 dorkbox, llc * Copyright 2023 dorkbox, llc
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -29,8 +29,22 @@ import java.net.Inet6Address
import java.net.InetAddress import java.net.InetAddress
import java.net.InetSocketAddress import java.net.InetSocketAddress
/**
 * Classifies a domain according to the section of the Mozilla Public Suffix List that it was matched in.
 *
 * @since 4.5
 */
enum class DomainType {
    /** No known section applies. */
    UNKNOWN,

    /** Entry belongs to the ICANN section of the list. */
    ICANN,

    /** Entry belongs to the PRIVATE section of the list. */
    PRIVATE
}
/**
 * Holds the entries of one section of the public suffix list, split by kind of rule.
 *
 * @property type the section these entries belong to
 * @property rules the normal (explicit) suffix rules
 * @property exceptions the exception rules (entries marked with `!` in the list)
 * @property wildcards the wildcard rules (entries marked with `*` in the list)
 */
data class PublicSuffixList(
    val type: DomainType,
    val rules: Set<String>,
    val exceptions: Set<String>,
    val wildcards: Set<String>
) {
    /**
     * Creates a list whose section is not known, ie: its [type] is [DomainType.UNKNOWN].
     */
    constructor(
        rules: Set<String>,
        exceptions: Set<String>,
        wildcards: Set<String>
    ) : this(DomainType.UNKNOWN, rules, exceptions, wildcards)
}
object Dns { object Dns {
/** /**
* Gets the version number. * Gets the version number.
*/ */
@ -38,8 +52,7 @@ object Dns {
const val DEFAULT_SEARCH_DOMAIN = "" const val DEFAULT_SEARCH_DOMAIN = ""
private val exceptions = HashSet<String>() private var listTypes: MutableList<PublicSuffixList>
private val suffixes = HashSet<String>()
/** /**
* @throws IOException if the DNS resolv.conf file cannot be read * @throws IOException if the DNS resolv.conf file cannot be read
@ -129,12 +142,6 @@ object Dns {
* *
* https://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1 * https://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1
* which is... * which is...
* https://publicsuffix.org/list/effective_tld_names.dat
*
*
* also
*
*
* https://publicsuffix.org/list/public_suffix_list.dat * https://publicsuffix.org/list/public_suffix_list.dat
* *
* *
@ -145,31 +152,98 @@ object Dns {
* http://svn.apache.org/repos/asf/httpcomponents/httpclient/trunk/httpclient5/src/main/java/org/apache/hc/client5/http/psl/ * http://svn.apache.org/repos/asf/httpcomponents/httpclient/trunk/httpclient5/src/main/java/org/apache/hc/client5/http/psl/
*/ */
this.listTypes = mutableListOf()
var domainType: DomainType? = null
var exceptions: MutableSet<String>? = null
var rules: MutableSet<String>? = null
var wildcards: MutableSet<String>? = null
// now load this file into memory, so it's faster to process. // now load this file into memory, so it's faster to process.
val tldResource = Dns.javaClass.getResourceAsStream("/effective_tld_names.dat") val tldResource = Dns.javaClass.getResourceAsStream("/public_suffix_list.dat")
tldResource.bufferedReader().useLines { lines ->
tldResource?.bufferedReader()?.useLines { lines ->
lines.forEach { line -> lines.forEach { line ->
var line = line if (line.isEmpty()) {
return@forEach
}
// entire lines can also be commented using // if (line.startsWith("//")) {
if (line.isNotEmpty() && !line.startsWith("//")) { if (domainType == null) {
if (line.contains("===BEGIN ICANN DOMAINS===")) {
if (line.startsWith(".")) { domainType = DomainType.ICANN
line = line.substring(1) // A leading dot is optional } else if (line.contains("===BEGIN PRIVATE DOMAINS===")) {
} domainType = DomainType.PRIVATE
}
// An exclamation mark (!) at the start of a rule marks an exception
// to a previous wildcard rule
val isException = line.startsWith("!")
if (isException) {
line = line.substring(1)
}
if (isException) {
exceptions.add(line)
} else { } else {
suffixes.add(line) if (line.contains("===END ICANN DOMAINS===") || line.contains("===END PRIVATE DOMAINS===")) {
if (rules == null) {
rules = mutableSetOf()
}
if (exceptions == null) {
exceptions = mutableSetOf()
}
if (wildcards == null) {
wildcards = mutableSetOf()
}
listTypes.add(PublicSuffixList(domainType!!, rules!!, exceptions!!, wildcards!!))
domainType = null
rules = null
exceptions = null
wildcards = null
}
} }
//entire lines can also be commented using //
return@forEach
}
if (domainType == null) {
// only parse data from well known sections
return@forEach
}
@Suppress("NAME_SHADOWING")
var line = line
if (line.startsWith(".")) {
line = line.substring(1) // A leading dot is optional
}
// An exclamation mark (!) at the start of a rule marks an exception to a previous wildcard rule
if (line.startsWith("!")) {
// *.kawasaki.jp
//!city.kawasaki.jp
line = line.substring(1)
if (exceptions == null) {
exceptions = mutableSetOf()
}
exceptions!!.add(line)
} else if (line.startsWith("*")) {
// *.kawasaki.jp
// motors.kawasaki.jp IS A TLD
// kawasaki.jp IS NOT a TLD
// city.kawasaki.jp IS NOT a TLD (!city.kawasaki.jp is a rule)
line = line.substring(2)
if (wildcards == null) {
wildcards = mutableSetOf()
}
wildcards!!.add(line)
} else {
// this is a normal rule
if (rules == null) {
rules = mutableSetOf()
}
rules!!.add(line)
} }
} }
} }
@ -186,6 +260,7 @@ object Dns {
* *
* @return null (if there is no second level domain) or the SLD www.aa.com -> aa.com , or www.amazon.co.uk -> amazon.co.uk * @return null (if there is no second level domain) or the SLD www.aa.com -> aa.com , or www.amazon.co.uk -> amazon.co.uk
*/ */
@Suppress("NAME_SHADOWING")
fun extractSLD(domain: String): String? { fun extractSLD(domain: String): String? {
var domain = domain var domain = domain
var last = domain var last = domain
@ -244,6 +319,7 @@ object Dns {
/** /**
* Checks if the domain is a TLD. * Checks if the domain is a TLD.
*/ */
@Suppress("NAME_SHADOWING")
fun isTLD(domain: String): Boolean { fun isTLD(domain: String): Boolean {
var domain = domain var domain = domain
if (domain.startsWith(".")) { if (domain.startsWith(".")) {
@ -254,21 +330,78 @@ object Dns {
// Exceptions are ones that are not a TLD, but would match a pattern rule // Exceptions are ones that are not a TLD, but would match a pattern rule
// e.g. bl.uk is not a TLD, but the rule *.uk means it is. Hence there is an exception rule // e.g. bl.uk is not a TLD, but the rule *.uk means it is. Hence there is an exception rule
// stating that bl.uk is not a TLD. // stating that bl.uk is not a TLD.
if (exceptions.contains(domain)) { listTypes.forEach { list ->
return false // exceptions always take priority
if (list.exceptions.contains(domain)) {
// exceptions list means that this is NOT a TLD, even though it looks like one
return false
}
if (list.rules.contains(domain)) {
// we have an explicit rule for this domain
return true
}
// Try patterns....
// *.kawasaki.jp
// motors.kawasaki.jp IS A TLD
// kawasaki.jp IS NOT a TLD
// city.kawasaki.jp IS NOT a TLD (!city.kawasaki.jp is a rule)
val nextdot = domain.indexOf('.')
if (nextdot == -1) {
// there is no wildcard possibility
return false
}
return list.wildcards.contains("*" + domain.substring(nextdot))
} }
if (suffixes.contains(domain)) { return false
return true }
/**
 * Determines which section of the public suffix list has an entry that matches the domain.
 *
 * A list matches when the domain is one of its exceptions, one of its explicit rules, or is covered by one of its
 * wildcard rules. The lists are consulted in the order they were loaded, and the first list that matches wins.
 *
 * Note that an exception entry means the domain is NOT a TLD, however it is still defined by that list, so the
 * type of that list is what is returned.
 *
 * @param domain the domain to look up. A single leading dot is optional, and is ignored
 *
 * @return the [DomainType] of the first list that has a matching entry, or [DomainType.UNKNOWN] if there is none
 */
fun getDomainType(domain: String): DomainType {
    // A leading dot is optional
    val name = domain.removePrefix(".")

    // Wildcard rules are stored WITHOUT their leading "*." (ie: "*.kawasaki.jp" is stored as "kawasaki.jp"), so the
    // key to look up is everything after the first label:
    //     motors.kawasaki.jp -> kawasaki.jp
    // If there is no dot, then there is no wildcard possibility (explicit rules/exceptions are still checked)
    val firstDot = name.indexOf('.')
    val wildcardKey = if (firstDot == -1) null else name.substring(firstDot + 1)

    for (list in listTypes) {
        // exceptions always take priority, then explicit rules. Both are defined in this public suffix list
        if (name in list.exceptions || name in list.rules) {
            return list.type
        }

        // Try patterns....
        //  *.kawasaki.jp
        //  motors.kawasaki.jp IS A TLD
        if (wildcardKey != null && wildcardKey in list.wildcards) {
            return list.type
        }
    }

    // no list knows about this domain
    return DomainType.UNKNOWN
}
} }