from pathlib import Path
from tempfile import TemporaryDirectory

from zstandard import ZstdCompressor

from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _binary_search_fitting_size, astuple, \
    _trim_items_to_page, _pad_to_page_size, _get_page_data


def test_create_index():
    num_pages = 10
    page_size = 4096
    with TemporaryDirectory() as temp_dir:
        index_path = Path(temp_dir) / 'temp-index.tinysearch'
        with TinyIndex.create(Document, str(index_path), num_pages=num_pages, page_size=page_size) as indexer:
            for i in range(num_pages):
                page = indexer.get_page(i)
                assert page == []


def test_binary_search_fitting_size_all_fit():
    items = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    compressor = ZstdCompressor()
    page_size = 4096
    count_fit, data = _binary_search_fitting_size(compressor, page_size, items, 0, len(items))

    # We should fit everything
    assert count_fit == len(items)


def test_binary_search_fitting_size_subset_fit():
    items = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    compressor = ZstdCompressor()
    page_size = 15
    count_fit, data = _binary_search_fitting_size(compressor, page_size, items, 0, len(items))

    # We should not fit everything
    assert count_fit < len(items)


def test_binary_search_fitting_size_none_fit():
    items = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    compressor = ZstdCompressor()
    page_size = 5
    count_fit, data = _binary_search_fitting_size(compressor, page_size, items, 0, len(items))

    # We should not fit anything
    assert count_fit == -1
    assert data is None


def test_get_page_data_single_doc():
    document1 = Document(title='title1', url='url1', extract='extract1', score=1.0)
    documents = [document1]
    items = [astuple(value) for value in documents]

    compressor = ZstdCompressor()
    page_size = 4096

    # Trim the data
    num_fitting, trimmed_data = _trim_items_to_page(compressor, page_size, items)

    # We should be able to fit the single item into a page
    assert num_fitting == 1

    # Compare the trimmed data to the actual data we're persisting.
    # We need to pad the trimmed data; then it should be equal to the data we persist.
    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
    serialized_data = _get_page_data(compressor, page_size, items)
    assert serialized_data == padded_trimmed_data


def test_get_page_data_many_docs_all_fit():
    # Build a list of documents small enough to all fit on one page
    documents = []
    documents_len = 500
    page_size = 4096
    for x in range(documents_len):
        txt = 'text{}'.format(x)
        document = Document(title=txt, url=txt, extract=txt, score=x)
        documents.append(document)
    items = [astuple(value) for value in documents]

    # Trim the items
    compressor = ZstdCompressor()
    num_fitting, trimmed_data = _trim_items_to_page(compressor, page_size, items)

    # We should be able to fit all items
    assert num_fitting == documents_len

    # Compare the trimmed data to the actual data we're persisting.
    # We need to pad the trimmed data; then it should be equal to the data we persist.
    serialized_data = _get_page_data(compressor, page_size, items)
    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
    assert serialized_data == padded_trimmed_data


def test_get_page_data_many_docs_subset_fit():
    # Build a list of documents too large to fit on one page
    documents = []
    documents_len = 5000
    page_size = 4096
    for x in range(documents_len):
        txt = 'text{}'.format(x)
        document = Document(title=txt, url=txt, extract=txt, score=x)
        documents.append(document)
    items = [astuple(value) for value in documents]

    # Trim the items
    compressor = ZstdCompressor()
    num_fitting, trimmed_data = _trim_items_to_page(compressor, page_size, items)

    # We should be able to fit only a subset of the items onto the page
    assert 1 < num_fitting < documents_len

    # Compare the trimmed data to the actual data we're persisting.
    # We need to pad the trimmed data; then it should be equal to the data we persist.
    serialized_data = _get_page_data(compressor, page_size, items)
    padded_trimmed_data = _pad_to_page_size(trimmed_data, page_size)
    assert serialized_data == padded_trimmed_data
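

# A minimal extra check, added as a sketch: it assumes _pad_to_page_size pads the
# compressed page out to exactly page_size bytes. The equality assertions above
# imply this (the padded data matches what _get_page_data persists for a page),
# but no test asserts the length directly.
def test_pad_to_page_size_pads_to_exact_length():
    compressor = ZstdCompressor()
    items = [astuple(Document(title='t', url='u', extract='e', score=1.0))]
    page_size = 4096

    # Trim the single-item page, then pad it
    num_fitting, trimmed_data = _trim_items_to_page(compressor, page_size, items)
    assert num_fitting == 1

    # Assumption: the padded page is exactly page_size bytes long
    padded = _pad_to_page_size(trimmed_data, page_size)
    assert len(padded) == page_size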