Automate updates for list of searx instances (#3)

* Create nightly update workflow for instances

A nightly GitHub Actions CI workflow has been added to fetch new
instances of supported services within Farside.

Currently only Searx is supported, but obviously others could be added
if there are similarly easy ways to fetch and filter instances
programmatically.

services.json has also been updated with the initial results of the
workflow script.

* Set headers for every HTTPoison request

This serves as a workaround for bot blocking via filtron.

* Expand filtering of searx instances

New filter enforces:
- No Cloudflare
- Good TLS config
- Good HTTP header config
- Vanilla instances or forks
- Instances with 100% search success
This commit is contained in:
Ben Busby 2021-11-26 09:12:46 -07:00 committed by GitHub
parent ff97d258f0
commit 8e3455a790
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 243 additions and 169 deletions

68
.github/workflows/update-instances.yml vendored Normal file
View file

@ -0,0 +1,68 @@
# Scheduled workflow that refreshes the instance lists in services.json.
#
# It runs once a day at 00:00 UTC, rebuilds the list of searx instances from
# the public searx.space dataset, and commits services.json back to the
# repository when the resulting file differs from what is already committed.
on:
  schedule:
    - cron: '0 0 * * *'

jobs:
  update-instances:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Install dependencies
        run: sudo apt-get install -y jq

      - name: Fetch instances
        run: |
          # Abort on the first failing command, including a failure on the
          # left-hand side of a pipe. Without pipefail a failed download would
          # hand jq an empty stream and the instance list would be replaced
          # with null further down.
          set -euo pipefail

          # Promote the freshly generated services-tmp.json to services.json,
          # remove the remaining *-tmp.json scratch files and normalize URLs.
          function apply_update() {
            mv services-tmp.json services.json
            rm -f *-tmp.json

            # Ensure no trailing slashes for any instance. This rewrites every
            # occurrence of /" in the file, not only instance URLs.
            # GNU sed (ubuntu-latest) takes the backup suffix attached to -i;
            # the BSD form `sed -i '' ...` would treat '' as the script and
            # the real script as a file name.
            sed -i 's/\/"/"/g' services.json
          }

          # ==============================================================
          # Git config
          # ==============================================================
          git config --global user.name github-actions
          # Quoted so that [bot] can never be interpreted as a glob pattern.
          git config --global user.email '41898282+github-actions[bot]@users.noreply.github.com'
          # NOTE(review): the remote is switched to SSH, but no step in this
          # workflow provisions an SSH key -- confirm the runner can push.
          git remote set-url origin git@github.com:benbusby/farside.git
          git checkout main

          # ==============================================================
          # Searx update
          # ==============================================================
          # Keep only clearnet instances that report a 1.0.0 version, have an
          # asn_privacy value of 0, returned no HTTP error, are graded A or A+
          # for both TLS and HTTP headers, and whose HTML grade is V or F.
          # The result is a sorted JSON array of instance URLs.
          # -f makes curl exit non-zero on an HTTP error instead of piping the
          # error page into jq.
          curl -sf https://searx.space/data/instances.json | \
            jq '[
              .instances |
              to_entries[] |
              select(.value.network_type == "normal") |
              select(.value.version | . != null) |
              select(.value.version | startswith("1.0.0")) |
              select(.value.network.asn_privacy == 0) |
              select(.value.http.error == null) |
              select(.value.tls.grade == "A+" or .value.tls.grade == "A") |
              select(.value.http.grade == "A+" or .value.http.grade == "A") |
              select(.value.html.grade == "V" or .value.html.grade == "F") |
              .key
            ] | sort' > searx-tmp.json

          # Replace the "instances" array of the entry whose type is "searx"
          # with the list generated above; all other entries are untouched.
          jq --slurpfile searx searx-tmp.json \
            '( .[] | select(.type == "searx") )
              .instances |= $searx[0]' services.json > services-tmp.json

          apply_update

          # ==============================================================
          # TODO: Update instances for other services
          # ==============================================================

          # ==============================================================
          # Push changes
          # ==============================================================
          # Stage first, then compare the index against HEAD by exit status.
          # `git diff-index --quiet` prints nothing, so testing its output (as
          # in `[[ $(...) ]]`) is always false and would attempt a commit even
          # when nothing changed, failing the job. Comparing the staged
          # content also avoids false positives from the file having been
          # recreated with identical contents.
          git add services.json
          if git diff-index --quiet --cached HEAD --; then
            echo "No updates"
          else
            git commit -m '[CI] Auto update instances'
            git push
          fi

View file

@ -8,4 +8,10 @@ config :farside,
fallback_suffix: "-fallback",
previous_suffix: "-previous",
services_json: "services.json",
index: "index.eex"
index: "index.eex",
headers: [
{"User-Agent", "Mozilla/5.0 (Linux x86_64; rv:94.0) Gecko/20100101 Firefox/94.0"},
{"Accept", "text/html"},
{"Accept-Language", "en-US,en;q=0.5"},
{"Accept-Encoding", "gzip, deflate, br"}
]

View file

@ -3,6 +3,7 @@ defmodule Farside.Instances do
@update_file Application.fetch_env!(:farside, :update_file)
@services_json Application.fetch_env!(:farside, :services_json)
@service_prefix Application.fetch_env!(:farside, :service_prefix)
@headers Application.fetch_env!(:farside, :headers)
def sync() do
File.rename(@update_file, "#{@update_file}-prev")
@ -21,7 +22,7 @@ defmodule Farside.Instances do
System.get_env("FARSIDE_TEST") ->
:good
true ->
case HTTPoison.get(url) do
case HTTPoison.get(url, @headers) do
{:ok, %HTTPoison.Response{status_code: 200}} ->
# TODO: Add validation of results, not just status code
:good

View file

@ -134,36 +134,35 @@
"test_url": "/search?q=github",
"fallback": "https://searx.be",
"instances": [
"https://paulgo.io",
"https://search.asynchronousexchange.com",
"https://anon.sx",
"https://searx.be",
"https://searx.gnous.eu",
"https://xeek.com",
"https://searx.bar",
"https://sx.fedi.tech",
"https://searx.tiekoetter.com",
"https://search.disroot.org",
"https://northboot.xyz",
"https://searx.fmac.xyz",
"https://metasearch.nl",
"https://searx.nevrlands.de",
"https://search.mdosch.de",
"https://searx.rasp.fr",
"https://searx.zackptg5.com",
"https://paulgo.io",
"https://procurx.pt",
"https://searx2.zackptg5.com",
"https://searx.pwoss.org",
"https://search.076.ne.jp/searx",
"https://darmarit.org/searx",
"https://suche.uferwerk.org",
"https://searx.nakhan.ne",
"https://suche.dasnetzundich.de",
"https://search.antonkling.se",
"https://jsearch.pw",
"https://searx.hummel-web.at",
"https://s.zhaocloud.net",
"https://search.asynchronousexchange.com",
"https://search.bluelock.org",
"https://search.mdosch.de",
"https://searx.ru"
"https://searx.bar",
"https://searx.be",
"https://searx.divided-by-zero.eu",
"https://searx.fmac.xyz",
"https://searx.hummel-web.at",
"https://searx.nevrlands.de",
"https://searx.prvcy.eu",
"https://searx.rasp.fr",
"https://searx.ru",
"https://searx.silkky.cloud",
"https://searx.sp-codes.de",
"https://searx.stuehieyr.com",
"https://searx.theanonymouse.xyz",
"https://searx.tiekoetter.com",
"https://searx.tux.land",
"https://searx.tuxcloud.net",
"https://searx.webheberg.info",
"https://searx.xyz",
"https://searx2.zackptg5.com",
"https://swag.pw",
"https://sx.fedi.tech"
]
}
]