Use new server

This commit is contained in:
Daoud Clarke 2022-06-09 22:24:54 +01:00
parent aaca8b2b6e
commit 14107acc75

View file

@ -12,21 +12,44 @@ import requests
from mwmbl.indexer.paths import CRAWL_GLOB
# Endpoint on the local machine. This name is rebound by the very next
# assignment, so this value is never the one in effect.
API_ENDPOINT = "http://localhost:8080/batches/historical"
# Effective URL: the address that batches are POSTed to.
API_ENDPOINT = "http://95.216.215.29/batches/historical"
def total_num_batches():
    """Return the number of paths that currently match ``CRAWL_GLOB``."""
    matching_paths = glob.glob(CRAWL_GLOB)
    return len(matching_paths)
def get_batches():
    """Yield every stored batch, one at a time, in sorted path order.

    Each path matching ``CRAWL_GLOB`` is opened as a gzip file and its
    contents are parsed as JSON.

    Yields:
        The decoded JSON value of each file.
    """
    # Sort the paths so that repeated runs visit the files in the same order;
    # glob.glob() itself makes no ordering promise.
    for path in sorted(glob.glob(CRAWL_GLOB)):
        # The context manager closes the file handle as soon as the batch is
        # parsed, instead of leaving it open until garbage collection.
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch
def convert_item(item):
    """Reshape one stored item into the nested layout that is uploaded.

    Args:
        item: Mapping with the keys ``'url'``, ``'timestamp'``, ``'title'``,
            ``'extract'`` and ``'links'``. Any other keys are ignored.

    Returns:
        A new dict with ``'url'``, ``'status'``, ``'timestamp'`` and
        ``'content'``; ``'content'`` holds the title, extract and links.

    Raises:
        KeyError: If ``item`` lacks one of the required keys.
    """
    converted = {}
    converted['url'] = item['url']
    # The status is always written as 200; the input item is not consulted.
    converted['status'] = 200
    converted['timestamp'] = item['timestamp']
    converted['content'] = {
        field: item[field] for field in ('title', 'extract', 'links')
    }
    return converted
def run():
    """Upload every stored batch to ``API_ENDPOINT``, printing progress.

    Each batch from ``get_batches()`` is rebuilt with its items passed through
    ``convert_item`` and sent as the JSON body of a POST request. The response
    object is printed with a running count; it is not otherwise inspected, so
    a non-2xx reply does not stop the run.
    """
    total_batches = total_num_batches()
    # get_batches() is a generator and can be consumed only once, so there
    # must be exactly one loop over it. Count from 1 so the last line printed
    # reads "N of N".
    for i, hashed_batch in enumerate(get_batches(), start=1):
        new_batch = {
            'user_id_hash': hashed_batch['user_id_hash'],
            'timestamp': hashed_batch['timestamp'],
            'items': [convert_item(item) for item in hashed_batch['items']],
        }
        response = requests.post(API_ENDPOINT, json=new_batch)
        print(f"Response {i} of {total_batches}", response)
if __name__ == '__main__':