Use blacklist on initialisation, add tests

This commit is contained in:
Daoud Clarke 2023-10-17 21:51:23 +01:00
parent ce844b59ae
commit 8c7ddda7d9
2 changed files with 24 additions and 1 deletion

View File

@ -46,10 +46,11 @@ class URLQueue:
def initialize(self):
    """Seed the URL queue from URLs stored with status ``NEW``.

    Loads the blacklisted domains once via ``get_blacklist_domains()``,
    fetches URLs with ``URLStatus.NEW`` from the URL database
    (``INITIALIZE_URLS`` is passed as the second argument to ``get_urls``
    -- presumably a row limit, confirm against ``URLDatabase``), and hands
    them to ``_process_found_urls`` together with the blacklist.
    """
    logger.info("Initializing URL queue")
    blacklist_domains = get_blacklist_domains()
    with Database() as db:
        url_db = URLDatabase(db.connection)
        found_urls = url_db.get_urls(URLStatus.NEW, INITIALIZE_URLS)
        # Process the batch exactly once, and always with the blacklist: the
        # earlier call without ``blacklist_domains`` was the pre-change line
        # and would have queued the same URLs a second time, unfiltered.
        self._process_found_urls(found_urls, blacklist_domains)
    logger.info(f"Initialized URL queue with {len(found_urls)} urls, current queue size: {self.num_queued_batches}")
def update(self):

22
test/test_blacklist.py Normal file
View File

@ -0,0 +1,22 @@
from mwmbl.indexer.blacklist import is_domain_blacklisted
def test_blacklist_excludes_bad_pattern():
    """Each listed domain must be reported as blacklisted.

    The second argument is an empty set on every call, so the positive
    result cannot come from an entry in that set (presumably the explicit
    blacklist -- confirm against ``is_domain_blacklisted``).
    """
    suspicious_hosts = (
        "brofqpxj.uelinc.com",
        "gwaspsag.enflightmultisport.com",
        "fmcqgzvk.onlinejobs2day.com",
        "btmjmhyj.universityslandown.com",
        "djqfctsq.ropman.com",
    )
    # A fresh empty set per call, evaluated in order, stopping at the first
    # host that is not flagged.
    assert all(is_domain_blacklisted(host, set()) for host in suspicious_hosts)
def test_blacklist_allows_top_domains():
    """``teamblog.supportbee.com`` must not be reported as blacklisted.

    Called with an empty set as the second argument.
    """
    flagged = is_domain_blacklisted("teamblog.supportbee.com", set())
    assert not flagged
def test_blacklist_allows_other_domains():
    """``something.com`` must not be reported as blacklisted.

    Called with an empty set as the second argument.
    """
    flagged = is_domain_blacklisted("something.com", set())
    assert not flagged