GNU Parallelを使用した大規模ブロックリストの処理

GNU Parallelを使用した大規模ブロックリストの処理

これは進行中のプロジェクトであり、リストの処理を高速化するための重要な更新が行われました。興味のある方は、こちらのホームページをご覧ください!

lists.json に定義されたリストを処理します。各リストの内容を生のホストに変換し、そのホストを、対応するメソッドとホスト形式に一致するリストへ振り分けます。build_lists.bash の先頭にある定数が、各変数の定義を示しています。

高速化やバグ修正の方法についての提案を歓迎します!また、プロジェクトはこの Docker 環境で実行することをお勧めします。ここに掲載しているすべてのファイルを同じディレクトリに配置し、すべてのスクリプトに実行権限を付与してから ./build_lists.bash を実行してください。aria2.conf の useragent を変更することもお勧めします。

入力する:

スクリプト:

build_lists.bash

#!/usr/bin/env bash

#shopt -s extdebug     # or --debugging
set +H +o history     # disable history features (helps avoid errors from "!" in strings)
shopt -u cmdhist      # would be enabled and have no effect otherwise
shopt -s execfail     # ensure interactive and non-interactive runtime are similar
shopt -s extglob      # enable extended pattern matching (https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html)
set -euET -o pipefail # put bash into strict mode & have it give descriptive errors
# Strip every group/other permission bit from newly created files, so that
# directories come out as 700 instead of 755. (The previous mask, 055, only
# removed r-x and therefore left group/other *write* access in place.)
umask 077

# Scratch space: DOWNLOADS is a private temporary directory and TMP is a
# temporary file created inside it.
DOWNLOADS=$(mktemp -d)
TMP=$(mktemp -p "$DOWNLOADS")
# How a list is applied.
METHOD_ALLOW='ALLOW'
METHOD_BLOCK='BLOCK'
# What kind of entry a list holds.
FORMAT_DOMAIN='DOMAIN'
FORMAT_CIDR4='CIDR4'
FORMAT_CIDR6='CIDR6'
FORMAT_IPV4='IPV4'
FORMAT_IPV6='IPV6'
readonly DOWNLOADS TMP METHOD_ALLOW METHOD_BLOCK FORMAT_DOMAIN FORMAT_CIDR4 FORMAT_CIDR6 FORMAT_IPV4 FORMAT_IPV6

# Iteration order matters: BLOCK lists are built before ALLOW lists, and the
# plain IP formats are built before the CIDR formats that are applied to them.
METHODS=("$METHOD_BLOCK" "$METHOD_ALLOW")
FORMATS=("$FORMAT_DOMAIN" "$FORMAT_IPV4" "$FORMAT_IPV6" "$FORMAT_CIDR4" "$FORMAT_CIDR6")
readonly -a METHODS
readonly -a FORMATS

# https://github.com/ildar-shaimordanov/perl-utils#sponge
# Soak up all of stdin, then write it to the file named by $1.
# Because nothing is written until stdin hits EOF, the target may be the same
# file an earlier pipeline stage is still reading from.
# params: output file
sponge() {
    # The three-argument open takes the file name literally; the two-argument
    # form would strip surrounding whitespace from it and interpret characters
    # such as ">" or "|". Write and close failures are reported, not ignored.
    perl -ne '
    push @lines, $_;
    END {
        open(my $out, ">", $file)
            or die "sponge: cannot open $file: $!\n";
        print {$out} @lines
            or die "sponge: cannot write $file: $!\n";
        close($out)
            or die "sponge: cannot close $file: $!\n";
    }
    ' -s -- -file="$1"
}

# Sort the file named by $1 in place with parsort and drop duplicate lines
# (-u), using $DOWNLOADS for parsort's temporary files. The result is written
# back through sponge, so the file is only replaced after it was fully read.
# params: file to sort
sorted() {
    local -r target="$1"

    parsort -bfiu -S 100% -T "$DOWNLOADS" "$target" |
        sponge "$target"

    printf '%s\n' "[INFO] Optimized: ${target}"
}

# Remove from the blacklist every line that appears verbatim in the whitelist
# and rewrite the blacklist in place.
# params: blacklist, whitelist
apply_whitelist() {
    # There may be no blacklist for this format; skip quietly in that case,
    # the same way apply_cidr_whitelist does, instead of failing on the
    # input redirection below.
    if [[ ! -f "$1" ]]; then
        return 0
    fi

    # https://askubuntu.com/a/562352
    # send each line into the temp file as it's processed instead of keeping it in memory
    # grep flags: -F fixed strings, -x whole-line match, -v invert, -f patterns from file.
    # NOTE(review): grep exits 1 for a chunk in which every line was
    # whitelisted; confirm how parallel's exit status then interacts with the
    # caller's `set -e`.
    parallel --pipe -k -j+0 grep --line-buffered -Fxvf "$2" - <"$1" >>"$TMP"
    cp "$TMP" "$1"
    : >"$TMP" # leave the shared temp file empty for the next call
    echo "[INFO] Applied whitelist to: ${1}"
}

# Drop from the IP list every entry matched by the CIDR list (grepcidr -v)
# and rewrite the IP list in place. Does nothing when the IP list is absent.
# params: ip list, cidr whitelist
apply_cidr_whitelist() {
    # Nothing to filter if this list was never produced.
    [[ -f "$1" ]] || return 0

    sem -j+0 grepcidr -vf "$2" <"$1" |
        sponge "$1"
    sem --wait

    echo "[INFO] Applied CIDR whitelist to: ${1}"
}

# Prepare the working tree: register exit-time cleanup, create the output and
# log directories, truncate the aria2 log and clear the sticky bit on /tmp.
init() {
    # Runs on every exit path, including a strict-mode abort halfway through
    # the build: remove the scratch directory and put the sticky bit back on
    # /tmp so a failed run does not leave /tmp without it.
    trap 'rm -rf -- "$DOWNLOADS"; chmod +t /tmp' EXIT || exit 1
    # logs/ must exist before the log file can be truncated below.
    mkdir -p build/ logs/
    : >logs/aria2.log
    chmod -t /tmp
}

# Counterpart of init: set the sticky bit on /tmp again.
cleanup() {
    local -r shared_tmp='/tmp'

    chmod +t "$shared_tmp"
}

# Build every list under build/.
# For each method (BLOCK first, then ALLOW):
#   1. download all ARIA2-retrieved sources of that method from lists.json,
#   2. for each format, run apply_filters.bash over every downloaded file and
#      merge the captured stdout into build/<method>_<format>.txt,
#   3. sort/deduplicate the merged list,
#   4. ALLOW lists are subtracted from the matching BLOCK lists; BLOCK CIDR
#      lists are subtracted from the BLOCK IP lists they already cover.
# Finally empty lists are removed and build/CHECKSUMS.txt is written.
main() {
    local cache
    local list
    local blacklist
    local results
    local method
    local format

    init

    for method in "${METHODS[@]}"; do
        cache="${DOWNLOADS}/${method}"

        echo "[INFO] Processing method: ${method}"

        set +e # temporarily disable strict fail, in case downloads fail
        jq -r --arg method "$method" 'to_entries[] |
            select(.value.content.retriever == "ARIA2" and .value.method == $method) |
            {key, mirrors: .value.mirrors} |
            (.mirrors | join("\t")), " out=\(.key)"' lists.json |
            aria2c -i- -d "$cache" --conf-path='./aria2.conf'
        set -e

        echo "[INFO] Downloaded ${method} lists!"

        for format in "${FORMATS[@]}"; do
            results="${cache}/${format}"
            mkdir -p "$results"

            echo "[INFO] Sending list results to: ${results}"

            # One apply_filters.bash invocation per file: the script reads
            # exactly "$1 $2 $3" (file, method, format), so several files must
            # never be packed into a single command line (which -X would do
            # once there are more files than job slots).
            find -P -O3 "$cache" -maxdepth 1 -type f -print0 |
                # https://www.gnu.org/software/parallel/parallel_tutorial.html#controlling-the-execution
                parallel -0 --use-cpus-instead-of-cores --jobs 0 --results "$results" ./apply_filters.bash {} "$method" "$format"

            list="build/${method}_${format}.txt"

            echo "[INFO] Processed: ${list}"

            # Merge the stdout captured by --results for every job.
            find -P -O3 "$results" -type f -name stdout -exec cat -s {} + | sponge "$list"

            # -s is true only for an existing, non-empty file.
            if [[ -s "$list" ]]; then
                sorted "$list"

                if [[ "$method" == "$METHOD_ALLOW" ]]; then
                    blacklist="build/BLOCK_${format}.txt"
                    echo "[INFO] Applying whitelist: ${list}"

                    case "$format" in
                    "$FORMAT_CIDR4")
                        apply_cidr_whitelist "$blacklist" "$list"
                        apply_cidr_whitelist "build/BLOCK_IPV4.txt" "$list"
                        ;;
                    "$FORMAT_CIDR6")
                        apply_cidr_whitelist "$blacklist" "$list"
                        apply_cidr_whitelist "build/BLOCK_IPV6.txt" "$list"
                        ;;
                    *)
                        apply_whitelist "$blacklist" "$list"
                        ;;
                    esac
                else
                    # Remove IPs from the IP blacklists that are covered by the CIDR blacklists
                    case "$format" in
                    "$FORMAT_CIDR4")
                        apply_cidr_whitelist "build/BLOCK_IPV4.txt" "$list"
                        ;;
                    "$FORMAT_CIDR6")
                        apply_cidr_whitelist "build/BLOCK_IPV6.txt" "$list"
                        ;;
                    *) ;;
                    esac
                fi

                echo "[INFO] Processed ${method} ${format} list!"
            fi
        done
    done

    # https://superuser.com/questions/191889/how-can-i-list-only-non-empty-files-using-ls
    find -P -O3 ./build/ -size 0 -type f -name "*.txt" -exec rm -- {} + # remove any empty lists
    # Checksum the lists only: a CHECKSUMS.txt left over from an earlier run
    # also matches "*.txt" and must not be hashed into the new file.
    find -P -O3 ./build/ -type f -name "*.txt" ! -name 'CHECKSUMS.txt' -exec sha256sum {} + | sponge './build/CHECKSUMS.txt'

    cleanup
}

# https://github.com/koalaman/shellcheck/wiki/SC2218
main

apply_filters.bash

#!/usr/bin/env bash

# Filter stdin through `ipinfo grepip`, restricted to IPv4 (-4), without
# colour codes in the output.
# NOTE(review): -h/-o/-x are presumably grepip's no-filename, only-matching
# and exclude-reserved switches — confirm against the installed ipinfo CLI.
get_ipv4s() {
    ipinfo grepip -4hox --nocolor
}

# Filter stdin through `ipinfo grepip`, restricted to IPv6 (-6), without
# colour codes in the output.
# NOTE(review): -h/-o/-x are presumably grepip's no-filename, only-matching
# and exclude-reserved switches — confirm against the installed ipinfo CLI.
get_ipv6s() {
    ipinfo grepip -6hox --nocolor
}

# Scan each stdin line for http/https/udp URLs (Regexp::Common URI pattern)
# and print capture group 3 of every URL for which is_domain() succeeds;
# ".onion" is accepted as a private TLD. Perl's stderr is discarded.
# NOTE(review): $3 is presumably the host part under -keep — confirm against
# the Regexp::Common::URI::http documentation.
get_domains_from_urls() {
    perl -MData::Validate::Domain=is_domain -MRegexp::Common=URI -nE 'while (/$RE{URI}{HTTP}{-scheme => "https?|udp"}{-keep}/g) {say $3 if is_domain($3, { domain_private_tld => { onion => 1 } })}' 2>/dev/null
}

# Scan each stdin line for http/https/udp URLs (Regexp::Common URI pattern)
# and print capture group 3 of every URL for which is_ipv4() succeeds.
# Perl's stderr is discarded.
# NOTE(review): $3 is presumably the host part under -keep — confirm against
# the Regexp::Common::URI::http documentation.
get_ipv4s_from_urls() {
    perl -MData::Validate::IP=is_ipv4 -MRegexp::Common=URI -nE 'while (/$RE{URI}{HTTP}{-scheme => "https?|udp"}{-keep}/g) {say $3 if is_ipv4($3)}' 2>/dev/null
}

# Extract the domain from adblock-style rules of the form "||domain^",
# optionally followed by "$third-party", and print it in lower case.
# Lines of any other shape are ignored.
hostsblock() {
    # Splitting on "|" or "^" turns "||example.com^" into the fields
    # "", "", "example.com", "" - so the domain is field 3.
    gawk -F '[|^]' '
        /^\|\|([[:alnum:]_-]{1,63}\.)+[[:alpha:]]+\^(\$third-party)?$/ {
            print tolower($3)
        }
    '
}

# Read CSV from stdin with Miller, skipping comment lines, trim whitespace
# in the records (clean-whitespace) and keep only the field named by $1.
# NOTE(review): -N presumably makes Miller treat the input as headerless and
# name fields by position, so $1 acts as a 1-based column number — confirm.
# params: column number
mlr_cut_col() {
    mlr --csv --skip-comments -N clean-whitespace then cut -f "$1"
}

# Turn one downloaded list file into validated entries on stdout.
# The work is a single four-stage pipeline:
#   1. emit the raw payload of the file (unpacking it when needed),
#   2. pull candidate hosts out of that payload with the list-specific filter,
#   3. drop blank lines and repeated lines,
#   4. keep only the entries that validate for the requested format.
# A value that matches no case arm makes that stage print nothing.
# params: file path, list method (BLOCK|ALLOW), content filter,
#         content type (TEXT|JSON|CSV|YAML), list filter, list format
process_list() {
    local FILE_PATH
    local LIST_METHOD
    local CONTENT_FILTER
    local CONTENT_TYPE
    local LIST_FILTER
    local LIST_FORMAT

    FILE_PATH="$1"
    LIST_METHOD="$2"
    CONTENT_FILTER="$3"
    CONTENT_TYPE="$4"
    LIST_FILTER="$5"
    LIST_FORMAT="$6"

    # Stage 1: raw payload.
    case "$CONTENT_FILTER" in
    'NONE') cat -s "$FILE_PATH" ;;
    '7Z') 7za -y -so e "$FILE_PATH" ;;
    'ZIP') zcat "$FILE_PATH" ;;
    'GZIP') gzip -cd "$FILE_PATH" ;;
    'TARBALL') tar -xOzf "$FILE_PATH" ;;
    'SQUIDGUARD') tar -xOzf "$FILE_PATH" --wildcards-match-slash --wildcards '*/domains' ;;
    'SCAFROGLIA') unzip -p "$FILE_PATH" blocklists-master/*.txt ;;
    'SHADOWWHISPERER') unzip -p "$FILE_PATH" BlockLists-master/RAW/* ;;
    'ESOX_LUCIUS') unzip -p "$FILE_PATH" PiHoleblocklists-main/* -x PiHoleblocklists-main/LICENSE PiHoleblocklists-main/README.md ;;
    esac |
        case "$CONTENT_TYPE" in
        'TEXT')
            case "$LIST_FILTER" in
            'NONE') cat -s ;;
            'RAW_HOSTS_WITH_COMMENTS') mawk '/^[^[:space:]|^#|^!|^;|^$|^:]/{print $1}' ;;
            'HOSTS_FILE') ghosts -m /dev/stdin -o -p -noheader -stats=false ;;
            'ABUSE_CH_URLHAUS_DOMAIN') get_domains_from_urls ;;
            'ABUSE_CH_URLHAUS_IPV4') get_ipv4s_from_urls ;;
            'ALIENVAULT') mawk -F# '{print $1}' ;;
            'ADBLOCK') hostsblock ;;
            'GREP_IPV4') get_ipv4s ;;
            'GREP_IPV6') get_ipv6s ;;
            'BOTVIRJ_IPV4') mawk -F'|' '{print $1}' ;;
            'CRYPTOLAEMUS_DOMAIN') hxextract code /dev/stdin | head -n -1 | tail -n +6 ;;
            'CRYPTOLAEMUS_IPV4') hxextract code /dev/stdin | head -n -1 | tail -n +6 | get_ipv4s ;;
            'CYBERCRIME_DOMAIN') mawk -F/ '{print $1}' ;;
            'CYBERCRIME_IPV4') mawk -F/ '{split($1,a,":");print a[1]}' | get_ipv4s ;;
            'DATAPLANE_IPV4') mawk -F'|' '$0~/^[^#]/{gsub(/ /,""); print $3}' ;;
            'DSHIELD') mlr --tsv --skip-comments -N put '$cidr = $1 . "/" . $3' then cut -f cidr ;;
            'MYIP_DOMAIN') mawk -F, '$0~/^[^#]/{print $2}' ;;
            'MYIP_IPV4') mawk '$0~/^[^#]/{print $1}' | get_ipv4s ;;
            'MYIP_IPV6') mawk '$0~/^[^#]/{print $1}' | get_ipv6s ;;
            # Keep only lines that start with the literal prefix "http".
            # (The earlier pattern /^[http]/ was a bracket expression and
            # matched any line starting with "h", "t" or "p".)
            'VXVAULT_DOMAIN') mawk '/^http/' | get_domains_from_urls ;;
            'VXVAULT_IPV4') mawk '/^http/' | get_ipv4s_from_urls ;;
            # NOTE(review): hostsblock prints a single whitespace-free field
            # per line, so `print $2` looks like it always yields an empty
            # line here — confirm against the XFILES feed.
            'XFILES') tr -d "[:blank:]" | hostsblock | mawk '{print $2}' ;;
            'TRACKERSLIST') mawk '{print $1}' | get_domains_from_urls ;;
            'CHARLES_B_HALEY') mawk '$0~/^[^#]/{print $3}' ;;
            'QUANTUMULTX') mawk -F, '$1~/^HOST-SUFFIX$/{print $2}' ;;
            'QUINDECIM') mawk -F= '$0~/^=/{print $2}' | mawk '{print $1}' ;;
            'ZEEK_DOMAIN') mawk '/^[^[:space:]|^#]/&&$2~/^Intel::DOMAIN$/{print $1}' ;;
            'ZEEK_IPV4') mawk '/^[^[:space:]|^#]/&&$2~/^Intel::ADDR$/{print $1}' ;;
            esac
            ;;
        'JSON')
            case "$LIST_FILTER" in
            'ABUSE_CH_FEODOTRACKER_IPV4') jq -r '.[].ip_address' ;;
            'ABUSE_CH_FEODOTRACKER_DOMAIN') jq -r '.[] | select(.hostname != null) | .hostname' ;;
            'ABUSE_CH_THREATFOX_IPV4') jq -r 'to_entries[].value[].ioc_value | split(":")[0]' ;;
            'ABUSE_CH_THREATFOX_DOMAIN') jq -r 'to_entries[].value[].ioc_value' ;;
            'AYASHIGE') jq -r '.[].fqdn' ;;
            'CYBER_CURE_IPV4') jq -r '.data.ip[]' ;;
            'CYBERSAIYAN_DOMAIN') jq -r '.[] | select(.value.type == "URL") | .indicator' | get_domains_from_urls ;;
            'CYBERSAIYAN_IPV4') jq -r '.[] | select(.value.type == "URL") | .indicator' | get_ipv4s_from_urls ;;
            'DISCONNECTME_ENTITIES') jq -r '.entities[] | "\(.properties[])\n\(.resources[])"' ;;
            'DISCONNECTME_SERVICES') jq -r '.categories[] | to_entries[].value[] | to_entries[].value[]' ;;
            'HIPO_UNIVERSITIES') jq -r '.[].domains | join("\n")' ;;
            'ISCSANS') jq -r '.[].ipv4' ;;
            'MALSILO_DOMAIN') jq -r '.data[].network_traffic | select(.dns != null) | .dns[]' ;;
            'MALSILO_IPV4') jq -r '.data[].network_traffic | select(.tcp != null) | .tcp[] | split(":")[0]' ;;
            'MALTRAIL') jq -r '.[].ip' ;;
            'TINYCHECK_DOMAIN') jq -r '.iocs[] | select(.type == "domain") | .value' ;;
            'TINYCHECK_FREEDNS') jq -r '.iocs[] | select(.type == "freedns") | .value' ;;
            'TINYCHECK_IPV4') jq -r '.iocs[] | select(.type == "ip4addr") | .value' ;;
            'TINYCHECK_CIDR') jq -r '.iocs[] | select(.type == "cidr") | .value' ;;
            'CHONG_LUA_DAO_DOMAIN') jq -r '.[].url' | get_domains_from_urls ;;
            'CHONG_LUA_DAO_IPV4') jq -r '.[].url' | get_ipv4s_from_urls ;;
            'INQUEST_DOMAIN') jq -r '.data[] | select(.artifact_type == "domain") | .artifact' ;;
            'INQUEST_IPV4') jq -r '.data[] | select(.artifact_type == "ipaddress") | .artifact' ;;
            'CERTEGO') jq -rs '.[].links[].url' | mawk -F/ '$5~/^domain$/{print $6}' ;;
            'SECUREDROP') jq -r '.[] | .onion_address as $onion | .organization_url | split("/")[2] as $org | $org, $onion' ;;
            esac
            ;;
        'CSV')
            case "$LIST_FILTER" in
            'MLR_CUT_1') mlr_cut_col 1 ;;
            'MLR_CUT_2') mlr_cut_col 2 ;;
            'MLR_CUT_3') mlr_cut_col 3 ;;
            'MLR_CUT_4') mlr_cut_col 4 ;;
            'BENKOW_DOMAIN') mlr --csv --headerless-csv-output --ifs ';' cut -f url | get_domains_from_urls ;;
            'BENKOW_IPV4') mlr --csv --headerless-csv-output --ifs ';' cut -f url | get_ipv4s_from_urls ;;
            'BOTVIRJ_COVID') mawk 'NR>1' ;;
            'CYBER_CURE_DOMAIN_URL') tr ',' '\n' | get_domains_from_urls ;;
            'MALWARE_DISCOVERER_DOMAIN') mlr --csv --headerless-csv-output cut -f domain ;;
            'MALWARE_DISCOVERER_IPV4') mlr --csv --headerless-csv-output cut -f ip ;;
            'PHISHSTATS_DOMAIN') mlr_cut_col 3 | get_domains_from_urls ;;
            'PHISHSTATS_IPV4') mlr_cut_col 4 | get_ipv4s ;;
            'PHISHSTATS_IPV6') mlr_cut_col 4 | get_ipv6s ;;
            'TURRIS') mlr --csv --headerless-csv-output --skip-comments cut -f Address ;;
            'VIRIBACK_DOMAIN') mlr --csv --headerless-csv-output cut -f URL | get_domains_from_urls ;;
            'VIRIBACK_IPV4') mlr --csv --headerless-csv-output cut -f IP ;;
            'SHADOWSERVER_HOST') mlr --csv --headerless-csv-output cut -f http_host ;;
            'SHADOWSERVER_TARGET') mlr --csv --headerless-csv-output cut -f redirect_target ;;
            'WATCHLIST_INTERNET') mlr --csv --ifs ';' -N cut -f 1 ;;
            'CRUZ_IT') mlr --csv --headerless-csv-output clean-whitespace then cut -f ip_address ;;
            'PHISHTANK') mlr --csv --headerless-csv-output cut -f url | get_domains_from_urls ;;
            'BLOCKLIST_UA') mlr --csv --ifs ';' --headerless-csv-output cut -f IP ;;
            esac
            ;;
        'YAML')
            case "$LIST_FILTER" in
            'CRYPTOSCAMDB_BLACKLIST') yq '.[].name' ;;
            'CRYPTOSCAMDB_WHITELIST') yq '.[].url' | get_domains_from_urls ;;
            esac
            ;;
        esac | mawk 'NF && !seen[$0]++' |
        case "$LIST_FORMAT" in
        'DOMAIN')
            perl ./process_domains.pl 2>/dev/null
            ;;
        # https://metacpan.org/pod/Data::Validate::IP
        'IPV4')
            case "$LIST_METHOD" in
            'BLOCK')
                perl -MData::Validate::IP=is_public_ipv4 -nE 'chomp; if(defined($_) && is_public_ipv4($_)) {say $_;}'
                ;;
            # Ensure bogons get whitelisted
            'ALLOW')
                perl -MData::Validate::IP=is_ipv4 -nE 'chomp; if(defined($_) && is_ipv4($_)) {say $_;}'
                ;;
            esac
            ;;
        'IPV6')
            case "$LIST_METHOD" in
            'BLOCK')
                perl -MData::Validate::IP=is_public_ipv6 -nE 'chomp; if(defined($_) && is_public_ipv6($_)) {say $_;}'
                ;;
            # Ensure bogons get whitelisted
            'ALLOW')
                perl -MData::Validate::IP=is_ipv6 -nE 'chomp; if(defined($_) && is_ipv6($_)) {say $_;}'
                ;;
            esac
            ;;
        'CIDR4')
            perl ./process_cidrs.pl 2>/dev/null
            ;;
        'CIDR6')
            perl ./process_cidrs.pl 2>/dev/null
            ;;
        esac
}

# Look up the downloaded file's entry in lists.json (keyed by the file's base
# name) and run process_list once for every declared format equal to $3.
# jq emits one "content_filter#content_type#list_filter" line per match.
# params: file path, list method, list format
main() {
    local list_key
    local content_filter content_type list_filter

    list_key="$(basename "$1")"

    while IFS='#' read -r content_filter content_type list_filter; do
        process_list "$1" "$2" "$content_filter" "$content_type" "$list_filter" "$3"
    done < <(
        jq -r --arg key "$list_key" --arg format "$3" 'to_entries[] |
        select(.key == $key) | .value |
        .content.filter as $content_filter |
        .content.type as $content_type |
        .formats[] |
        select(.format == $format) |
        "\($content_filter)#\($content_type)#\(.filter)"' lists.json
    )
}

main "$1" "$2" "$3"

process_domains.pl

#!/usr/bin/env perl

use warnings;
use strict;
use open ':std', ':encoding(UTF-8)';
use feature 'say';
use Try::Tiny;
use Text::Trim 'trim';
use Net::IDN::Encode 'domain_to_ascii';
use Data::Validate::Domain 'is_domain';

# Convert every input line to its ASCII domain form with domain_to_ascii and
# print it when is_domain accepts it (".onion" is allowed as a private TLD).
# The try block has no catch, so a line on which conversion dies is skipped
# silently and the loop moves on to the next line.
while (my $line = <>) {
  chomp $line;

  try {
    my $ascii = domain_to_ascii(trim($line));
    return unless defined $ascii;

    say $ascii if is_domain($ascii, { domain_private_tld => { onion => 1 } });
  };
}

process_cidrs.pl

#!/usr/bin/env perl

use warnings;
use strict;
use open ':std', ':encoding(UTF-8)';
use feature 'say';
use Try::Tiny;
use Text::Trim 'trim';
use Net::CIDR 'cidrvalidate';

# Print every input line that cidrvalidate accepts, in the form it returns.
# Invalid lines are skipped; a line on which validation dies is swallowed by
# the catch-less try block and likewise skipped.
while (<>) {
  chomp;

  try {
    # https://metacpan.org/pod/Net::CIDR#$ip=Net::CIDR::cidrvalidate($ip);
    my $cidr = cidrvalidate(trim($_));
    # `return` leaves only this try block, so the loop continues with the
    # next line. A loop control such as `last` here would end the whole read
    # loop at the first invalid entry and silently drop the rest of the list.
    return if !defined $cidr;
    say $cidr;
  };
}

ベストアンサー1

おすすめ記事