mwmbl/mwmbl/indexer/domains.py
milovanderlinden dfd3f3962e Fix issue #60
2022-07-10 11:10:03 +02:00

35 lines
1.1 KiB
Python

"""
Extract top domains from BigQuery result.
"""
import json
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
def get_top_domains(all_domains_path=None, top_domains_path=None,
                    min_count=None, probability_threshold=None):
    """
    Score the domains in a CSV file and write the best ones out as a dict.

    The CSV must have ``domain``, ``total`` and ``mean_score`` columns. Rows
    without a domain and rows whose ``total`` is below ``min_count`` are
    dropped. Each remaining domain is scored as
    ``mean_score * log(total) ** 2`` and mapped to
    ``score / (score + median_score)``. Domains whose value is strictly
    greater than ``probability_threshold`` are written, highest first, to
    ``top_domains_path`` as a ``DOMAINS = {...}`` assignment with one entry
    per line. The median score is also printed to stdout.

    Every argument is optional: one left as ``None`` falls back to the
    module-level name ``ALL_DOMAINS_PATH``, ``TOP_DOMAINS_PATH``,
    ``MIN_COUNT`` or ``PROBABILITY_THRESHOLD`` respectively, so a call with
    no arguments behaves as it did before these parameters existed.

    :param all_domains_path: CSV file to read.
    :param top_domains_path: file to (over)write with the result.
    :param min_count: minimum ``total`` a domain needs to be considered.
    :param probability_threshold: exclusive lower bound for the final value.
    :return: None; the result is written to ``top_domains_path``.
    """
    # Resolve defaults lazily so the module constants are only needed when
    # the caller did not supply a value.
    if all_domains_path is None:
        all_domains_path = ALL_DOMAINS_PATH
    if top_domains_path is None:
        top_domains_path = TOP_DOMAINS_PATH
    if min_count is None:
        min_count = MIN_COUNT
    if probability_threshold is None:
        probability_threshold = PROBABILITY_THRESHOLD

    data = pd.read_csv(all_domains_path, index_col='domain')
    # Rows with an empty domain field get a null index label; drop them.
    data = data[data.index.notnull()]
    frequent = data[data['total'] >= min_count]

    scores = frequent['mean_score'] * np.log(frequent['total']) ** 2
    median_score = np.median(scores)
    print("Median score", median_score)

    # A score equal to the median maps to 0.5; larger scores approach 1.
    probabilities = scores / (scores + median_score)
    top_probabilities = probabilities[probabilities > probability_threshold]
    top_probabilities = top_probabilities.sort_values(ascending=False)

    # Format each entry separately instead of post-processing str(dict):
    # replacing ', ' in the whole string would also split a key that happens
    # to contain ', ' and corrupt the written literal.
    entries = ",\n".join(
        f"{domain!r}: {probability!r}"
        for domain, probability in top_probabilities.to_dict().items()
    )

    # Write UTF-8 explicitly so non-ASCII domains do not depend on the
    # platform's default encoding.
    with open(top_domains_path, 'w', encoding='utf-8') as output_file:
        output_file.write("DOMAINS = {" + entries + "}\n\n")
# Script entry point: regenerate the top-domains file when run directly.
if __name__ == "__main__":
    get_top_domains()