Implement curation API using Django Ninja

This commit is contained in:
Daoud Clarke 2023-10-25 16:39:42 +01:00
parent bd017079d5
commit bb9e6aa4bd
10 changed files with 227 additions and 199 deletions

8
mwmbl/admin.py Normal file
View file

@ -0,0 +1,8 @@
from django.contrib.admin import ModelAdmin
from django.contrib.auth.admin import UserAdmin
from django.contrib import admin
from mwmbl.models import MwmblUser, UserCuration
# Manage project users through Django's stock UserAdmin.
admin.site.register(MwmblUser, UserAdmin)
# Curation records use the default ModelAdmin with no customisation.
admin.site.register(UserCuration, ModelAdmin)

View file

@ -7,6 +7,7 @@ from ninja import NinjaAPI
import mwmbl.crawler.app as crawler import mwmbl.crawler.app as crawler
from mwmbl.indexer.batch_cache import BatchCache from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.platform import curate
from mwmbl.tinysearchengine import search from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@ -24,13 +25,17 @@ batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
def create_api(version): def create_api(version):
api = NinjaAPI(version=version) # Set csrf to True to allow cookie-based authentication
api = NinjaAPI(version=version, csrf=True)
search_router = search.create_router(ranker) search_router = search.create_router(ranker)
api.add_router("/search/", search_router) api.add_router("/search/", search_router)
crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches) crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
api.add_router("/crawler/", crawler_router) api.add_router("/crawler/", crawler_router)
curation_router = curate.create_router(index_path)
api.add_router("/curation/", curation_router)
return api return api

View file

@ -6,19 +6,20 @@ from pathlib import Path
from django.apps import AppConfig from django.apps import AppConfig
from django.conf import settings from django.conf import settings
from mwmbl.api import queued_batches
from mwmbl import background
from mwmbl.indexer.paths import INDEX_NAME
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
from mwmbl.url_queue import update_queue_continuously
class MwmblConfig(AppConfig): class MwmblConfig(AppConfig):
name = "mwmbl" name = "mwmbl"
verbose_name = "Mwmbl Application" verbose_name = "Mwmbl Application"
def ready(self): def ready(self):
# Imports here to avoid AppRegistryNotReady exception
from mwmbl.api import queued_batches
from mwmbl import background
from mwmbl.indexer.paths import INDEX_NAME
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
from mwmbl.url_queue import update_queue_continuously
index_path = Path(settings.DATA_PATH) / INDEX_NAME index_path = Path(settings.DATA_PATH) / INDEX_NAME
try: try:
existing_index = TinyIndex(item_factory=Document, index_path=index_path) existing_index = TinyIndex(item_factory=Document, index_path=index_path)

View file

@ -0,0 +1,58 @@
# Generated by Django 4.2.6 on 2023-10-25 11:55
from django.conf import settings
import django.contrib.auth.models
import django.contrib.auth.validators
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration):
    """Initial migration: creates the ``MwmblUser`` and ``UserCuration`` models."""

    # First migration for this app.
    initial = True

    # MwmblUser has many-to-many relations to auth.group and auth.permission,
    # so the auth app's tables must exist first.
    dependencies = [
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        # Custom user model, managed by Django's UserManager.
        migrations.CreateModel(
            name='MwmblUser',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('password', models.CharField(max_length=128, verbose_name='password')),
                ('last_login', models.DateTimeField(blank=True, null=True, verbose_name='last login')),
                ('is_superuser', models.BooleanField(default=False, help_text='Designates that this user has all permissions without explicitly assigning them.', verbose_name='superuser status')),
                ('username', models.CharField(error_messages={'unique': 'A user with that username already exists.'}, help_text='Required. 150 characters or fewer. Letters, digits and @/./+/-/_ only.', max_length=150, unique=True, validators=[django.contrib.auth.validators.UnicodeUsernameValidator()], verbose_name='username')),
                ('first_name', models.CharField(blank=True, max_length=150, verbose_name='first name')),
                ('last_name', models.CharField(blank=True, max_length=150, verbose_name='last name')),
                ('email', models.EmailField(blank=True, max_length=254, verbose_name='email address')),
                ('is_staff', models.BooleanField(default=False, help_text='Designates whether the user can log into this admin site.', verbose_name='staff status')),
                ('is_active', models.BooleanField(default=True, help_text='Designates whether this user should be treated as active. Unselect this instead of deleting accounts.', verbose_name='active')),
                ('date_joined', models.DateTimeField(default=django.utils.timezone.now, verbose_name='date joined')),
                ('groups', models.ManyToManyField(blank=True, help_text='The groups this user belongs to. A user will get all permissions granted to each of their groups.', related_name='user_set', related_query_name='user', to='auth.group', verbose_name='groups')),
                ('user_permissions', models.ManyToManyField(blank=True, help_text='Specific permissions for this user.', related_name='user_set', related_query_name='user', to='auth.permission', verbose_name='user permissions')),
            ],
            options={
                'verbose_name': 'user',
                'verbose_name_plural': 'users',
                'abstract': False,
            },
            managers=[
                ('objects', django.contrib.auth.models.UserManager()),
            ],
        ),
        # One row per curation; rows are deleted together with their user
        # (on_delete=CASCADE).
        migrations.CreateModel(
            name='UserCuration',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('timestamp', models.DateTimeField()),
                ('url', models.CharField(max_length=300)),
                ('results', models.JSONField()),
                ('curation_type', models.CharField(max_length=20)),
                ('curation', models.JSONField()),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
            ],
        ),
    ]

View file

15
mwmbl/models.py Normal file
View file

@ -0,0 +1,15 @@
from django.db import models
from django.contrib.auth.models import AbstractUser
class MwmblUser(AbstractUser):
    """Project user model; adds nothing to Django's ``AbstractUser``."""
class UserCuration(models.Model):
    """A single curation of a search-results page, recorded against a user."""

    # The user who made the curation; their curations are deleted with them.
    user = models.ForeignKey(MwmblUser, on_delete=models.CASCADE)

    # When the curation was made.
    timestamp = models.DateTimeField()

    # URL associated with the curation (at most 300 characters).
    url = models.CharField(max_length=300)

    # JSON snapshot of the result list.
    results = models.JSONField()

    # Short label for the kind of curation (at most 20 characters).
    curation_type = models.CharField(max_length=20)

    # JSON payload describing the curation itself.
    curation = models.JSONField()

82
mwmbl/platform/curate.py Normal file
View file

@ -0,0 +1,82 @@
import json
import logging
from urllib.parse import urljoin, parse_qs, urlparse

import requests
from ninja import Router
from ninja.security import django_auth

from mwmbl.indexer.update_urls import get_datetime_from_timestamp
from mwmbl.models import UserCuration
from mwmbl.platform.data import CurateBegin, CurateMove, CurateDelete, CurateAdd, CurateValidate, Curation
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tokenizer import tokenize
RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0
logger = logging.getLogger(__name__)


def _query_from_url(url: str) -> str:
    """Extract the single search query value from a results-page URL.

    Args:
        url: URL of the curated results page, e.g. ``https://mwmbl.org/?q=foo``.

    Returns:
        The value of the URL's only query parameter.

    Raises:
        ValueError: If the URL does not contain exactly one query parameter
            with exactly one value.
    """
    # Parse only the query component so the scheme, host and fragment cannot
    # leak into the parameter name or value. Fall back to the raw string so a
    # bare "q=foo" is still accepted, as it was before.
    query_string = parse_qs(urlparse(url).query or url)
    if len(query_string) != 1:
        raise ValueError(f"Should be one query string in the URL: {url}")

    queries = next(iter(query_string.values()))
    if len(queries) != 1:
        raise ValueError(f"Should be one query value in the URL: {url}")
    return queries[0]


def create_router(index_path: str) -> Router:
    """Build the curation router.

    Every endpoint is a POST that requires Django session authentication
    (``django_auth``) and delegates to the shared ``_curate`` helper.

    Args:
        index_path: Path of the index file opened with ``TinyIndex``.

    Returns:
        A ``Router`` exposing ``/begin``, ``/move``, ``/delete``, ``/add`` and
        ``/validate``.
    """
    router = Router(tags=["user"])

    # "/begin" takes the same envelope as the other endpoints: `_curate` reads
    # timestamp, url, results and curation from it, which the bare (empty)
    # CurateBegin schema does not provide.
    @router.post("/begin", auth=django_auth)
    def user_begin_curate(request, curate_begin: Curation[CurateBegin]):
        return _curate(request, "curate_begin", curate_begin)

    @router.post("/move", auth=django_auth)
    def user_move_result(request, curate_move: Curation[CurateMove]):
        return _curate(request, "curate_move", curate_move)

    @router.post("/delete", auth=django_auth)
    def user_delete_result(request, curate_delete: Curation[CurateDelete]):
        return _curate(request, "curate_delete", curate_delete)

    @router.post("/add", auth=django_auth)
    def user_add_result(request, curate_add: Curation[CurateAdd]):
        return _curate(request, "curate_add", curate_add)

    # Distinct function name: reusing `user_add_result` here would give two
    # operations the same generated operation id.
    @router.post("/validate", auth=django_auth)
    def user_validate_result(request, curate_validate: Curation[CurateValidate]):
        return _curate(request, "curate_validate", curate_validate)

    def _curate(request, curation_type: str, curation: Curation):
        """Record a curation and store its results in the index.

        Args:
            request: The authenticated request; ``request.user`` owns the record.
            curation_type: Label stored with the record, e.g. ``"curate_move"``.
            curation: The submitted curation envelope.

        Returns:
            ``{"curation": "ok"}`` on success.

        Raises:
            ValueError: If ``curation.url`` does not carry exactly one query value.
        """
        # Validate the URL before any side effects so a bad request leaves
        # neither a database row nor an index change behind.
        query = _query_from_url(curation.url)
        tokens = tokenize(query)
        term = " ".join(tokens)
        logger.debug("Query %r, tokens %r, key %r", query, tokens, term)

        user_curation = UserCuration(
            user=request.user,
            timestamp=get_datetime_from_timestamp(curation.timestamp),
            url=curation.url,
            results=curation.dict()["results"],
            curation_type=curation_type,
            curation=curation.curation.dict(),
        )
        user_curation.save()

        # Scores decrease with position, so the submitted order is preserved
        # when results are ranked by score.
        documents = [
            Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
            for i, result in enumerate(curation.results)
        ]

        with TinyIndex(Document, index_path, 'w') as indexer:
            page_index = indexer.get_key_page_index(term)
            logger.debug("Storing documents in page %s: %s", page_index, documents)
            indexer.store_in_page(page_index, documents)

        return {"curation": "ok"}

    return router

45
mwmbl/platform/data.py Normal file
View file

@ -0,0 +1,45 @@
from datetime import datetime
from typing import TypeVar, Generic
from ninja import Schema
class Result(Schema):
    """A single search result within a curated result list."""

    url: str
    title: str
    extract: str
    # presumably marks results a user has already curated; confirm with the front end
    curated: bool
class CurateBegin(Schema):
    """Payload for the "begin" curation type; it carries no fields."""
class CurateMove(Schema):
    """Payload for a "move" curation.

    The indices presumably refer to positions in the accompanying result list
    before and after the move; confirm against the front end.
    """

    old_index: int
    new_index: int
class CurateDelete(Schema):
    """Payload for a "delete" curation."""

    # presumably the position of the removed result; confirm against the front end
    delete_index: int
class CurateAdd(Schema):
    """Payload for an "add" curation."""

    # presumably the position at which the new result is inserted; confirm
    insert_index: int
    url: str
class CurateValidate(Schema):
    """Payload for a "validate" curation."""

    # presumably the position of the result being (in)validated; confirm
    validate_index: int
    is_validated: bool
# Constrained type variable: the payload types a Curation envelope can carry.
T = TypeVar('T', CurateBegin, CurateAdd, CurateDelete, CurateMove, CurateValidate)
class Curation(Schema, Generic[T]):
    """Envelope shared by curation requests, generic over the payload type."""

    # Integer timestamp of the curation; units are not visible here.
    timestamp: int
    # URL associated with the curation.
    url: str
    # The result list submitted with the curation.
    results: list[Result]
    # The type-specific payload.
    curation: T

View file

@ -1,190 +0,0 @@
import json
import os
from typing import TypeVar, Generic
from urllib.parse import urljoin, parse_qs
import requests
from fastapi import APIRouter, Response
from pydantic import BaseModel
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tokenizer import tokenize
LEMMY_URL = os.environ["LEMMY_URL"]
RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0
class Register(BaseModel):
    """Request body for ``/register``; the fields are forwarded to Lemmy."""

    username: str
    email: str
    password: str
    password_verify: str
class Login(BaseModel):
    """Request body for ``/login``; sent to Lemmy's login endpoint as-is."""

    username_or_email: str
    password: str
class Result(BaseModel):
    """A single search result within a curated result list."""

    url: str
    title: str
    extract: str
    # Passed through to the stored Document unchanged.
    curated: bool
class BeginCurate(BaseModel):
    """Request body for ``/curation/begin``."""

    # Auth value forwarded to Lemmy when the post is created.
    auth: str
    # Used as both the name and the URL of the Lemmy post.
    url: str
    # Stored in the post body as ``original_results``.
    results: list[Result]
class CurateMove(BaseModel):
    """Payload for a "move" curation.

    The indices presumably refer to positions in the accompanying result list
    before and after the move; confirm against the front end.
    """

    old_index: int
    new_index: int
class CurateDelete(BaseModel):
    """Payload for a "delete" curation."""

    # presumably the position of the removed result; confirm against the front end
    delete_index: int
class CurateAdd(BaseModel):
    """Payload for an "add" curation."""

    # presumably the position at which the new result is inserted; confirm
    insert_index: int
    url: str
class CurateValidate(BaseModel):
    """Payload for a "validate" curation."""

    # presumably the position of the result being (in)validated; confirm
    validate_index: int
    is_validated: bool
# Constrained type variable: the payload types a Curation envelope can carry.
T = TypeVar('T', CurateAdd, CurateDelete, CurateMove, CurateValidate)
class Curation(BaseModel, Generic[T]):
    """Envelope shared by curation requests, generic over the payload type."""

    # Auth value forwarded to Lemmy when the comment is created.
    auth: str
    # Id of the Lemmy post the curation comment is attached to.
    curation_id: int
    # Results-page URL; the search query is read from it.
    url: str
    # The result list after the curation, in order.
    results: list[Result]
    # The type-specific payload.
    curation: T
def create_router(index_path: str) -> APIRouter:
    """Build the ``/user`` router that proxies accounts and curations to Lemmy.

    Registration and login are forwarded to the Lemmy instance at
    ``LEMMY_URL``. Beginning a curation creates a Lemmy post; each later
    curation step is recorded as a comment on that post and the submitted
    results are stored in the index.

    Args:
        index_path: Path of the index file opened with ``TinyIndex``.

    Returns:
        The configured ``APIRouter``.
    """
    router = APIRouter(prefix="/user", tags=["user"])

    # TODO: reinstate
    # community_id = get_community_id()
    community_id = 0

    def _relay(lemmy_response) -> Response:
        """Pass a Lemmy HTTP response back to the caller unchanged."""
        return Response(
            content=lemmy_response.content,
            status_code=lemmy_response.status_code,
            media_type="text/json",
        )

    @router.post("/register")
    def user_register(register: Register) -> Response:
        lemmy_register = {
            "username": register.username,
            "email": register.email,
            "password": register.password,
            "password_verify": register.password_verify,
            "answer": "not applicable",
            "captcha_answer": None,
            "captcha_uuid": None,
            "honeypot": None,
            "show_nsfw": False,
        }
        lemmy_response = requests.post(urljoin(LEMMY_URL, "api/v3/user/register"), json=lemmy_register)
        # Relay the outcome on success as well as on failure; previously a
        # successful registration fell through and returned nothing.
        return _relay(lemmy_response)

    @router.post("/login")
    def user_login(login: Login) -> Response:
        lemmy_response = requests.post(urljoin(LEMMY_URL, "api/v3/user/login"), json=login.dict())
        return _relay(lemmy_response)

    @router.post("/curation/begin")
    def user_begin_curate(begin_curate: BeginCurate):
        results = begin_curate.dict()["results"]
        body = json.dumps({"original_results": results}, indent=2)
        create_post = {
            "auth": begin_curate.auth,
            "body": body,
            "community_id": community_id,
            "honeypot": None,
            "language_id": None,
            "name": begin_curate.url,
            "nsfw": None,
            "url": begin_curate.url,
        }
        lemmy_response = requests.post(urljoin(LEMMY_URL, "api/v3/post"), json=create_post)
        if lemmy_response.status_code != 200:
            return _relay(lemmy_response)
        data = lemmy_response.json()
        curation_id = data["post_view"]["post"]["id"]
        return {"curation_id": curation_id}

    @router.post("/curation/move")
    def user_move_result(curate_move: Curation[CurateMove]):
        return _curate("curate_move", curate_move)

    @router.post("/curation/delete")
    def user_delete_result(curate_delete: Curation[CurateDelete]):
        return _curate("curate_delete", curate_delete)

    @router.post("/curation/add")
    def user_add_result(curate_add: Curation[CurateAdd]):
        return _curate("curate_add", curate_add)

    # Distinct function name: this handler previously reused `user_add_result`.
    @router.post("/curation/validate")
    def user_validate_result(curate_validate: Curation[CurateValidate]):
        return _curate("curate_validate", curate_validate)

    def _curate(curation_type: str, curation: Curation):
        """Record a curation as a Lemmy comment and store its results.

        Raises:
            ValueError: If ``curation.url`` does not carry exactly one query value.
        """
        # Validate the URL before contacting Lemmy so a bad request has no
        # side effects. `!= 1` also rejects a URL with no query at all, which
        # used to surface as a bare StopIteration.
        query_string = parse_qs(curation.url)
        if len(query_string) != 1:
            raise ValueError(f"Should be one query string in the URL: {curation.url}")

        queries = next(iter(query_string.values()))
        if len(queries) > 1:
            raise ValueError(f"Should be one query value in the URL: {curation.url}")
        query = queries[0]

        content = json.dumps({
            "curation_type": curation_type,
            "curation": curation.curation.dict(),
        }, indent=2)
        create_comment = {
            "auth": curation.auth,
            # `content` is already a JSON document; encoding it a second time
            # would store an escaped string literal instead.
            "content": content,
            "form_id": None,
            "language_id": None,
            "parent_id": None,
            "post_id": curation.curation_id,
        }
        lemmy_response = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)
        # Lemmy is the only authority that checks `auth` here, so the index
        # must not be touched when it rejects the comment.
        if lemmy_response.status_code != 200:
            return _relay(lemmy_response)

        print("Query", query)
        tokens = tokenize(query)
        print("Tokens", tokens)
        term = " ".join(tokens)
        print("Key", term)

        # Scores decrease with position, so the submitted order is preserved
        # when results are ranked by score.
        documents = [
            Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
            for i, result in enumerate(curation.results)
        ]

        with TinyIndex(Document, index_path, 'w') as indexer:
            page_index = indexer.get_key_page_index(term)
            print("Page index", page_index)
            print("Storing documents", documents)
            indexer.store_in_page(page_index, documents)

        return _relay(lemmy_response)

    return router
def get_community_id() -> str:
    """Look up the id of the Lemmy community named ``main``.

    Sends a GET to ``api/v3/community`` on ``LEMMY_URL`` and returns
    ``community_view.community.id`` from the JSON reply.
    """
    # NOTE(review): annotated as ``str``, but the placeholder community id
    # used in create_router is the int 0; confirm the actual type.
    community_url = urljoin(LEMMY_URL, "api/v3/community?name=main")
    payload = requests.get(community_url).json()
    return payload["community_view"]["community"]["id"]

View file

@ -119,7 +119,6 @@ USE_TZ = True
STATIC_URL = 'static/' STATIC_URL = 'static/'
STATICFILES_DIRS = [str(Path(__file__).parent.parent / "front-end" / "dist")] STATICFILES_DIRS = [str(Path(__file__).parent.parent / "front-end" / "dist")]
print("Static files", STATICFILES_DIRS)
# Default primary key field type # Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
@ -134,5 +133,10 @@ AUTHENTICATION_BACKENDS = [
'allauth.account.auth_backends.AuthenticationBackend', 'allauth.account.auth_backends.AuthenticationBackend',
] ]
AUTH_USER_MODEL = "mwmbl.MwmblUser"
ACCOUNT_EMAIL_REQUIRED = True ACCOUNT_EMAIL_REQUIRED = True
ACCOUNT_EMAIL_VERIFICATION = "mandatory" ACCOUNT_EMAIL_VERIFICATION = "mandatory"