Files
BibAPI/tests/test_marcxml_parser.py
WorldTeacher 2a98718699
Some checks failed
/ typecheck (pull_request) Failing after 11s
tests: add more tests
2025-12-09 09:17:13 +01:00

487 lines
17 KiB
Python

"""Tests for MARCXML parsing functions in sru.py."""
import xml.etree.ElementTree as ET
import pytest
from bibapi.schemas.marcxml import (
DataField,
SubField,
)
from bibapi.sru import (
_smart_join_title,
_text,
controlfield_value,
datafield_value,
datafields_value,
find_datafields_with_subfields,
first_subfield_value,
first_subfield_value_from_fields,
iter_datafields,
parse_marc_record,
parse_search_retrieve_response,
subfield_values,
subfield_values_from_fields,
)
# --- Fixtures for sample XML data ---
@pytest.fixture
def minimal_marc_xml() -> str:
    """Provide a small MARCXML record as a string.

    The record holds a leader, control fields 001 and 005, and a single
    245 data field (ind1="1", ind2="0") with subfields ``a`` and ``b``.
    """
    xml_text = """<?xml version="1.0" encoding="UTF-8"?>
<marc:record xmlns:marc="http://www.loc.gov/MARC21/slim">
<marc:leader>00000nam a22000001i 4500</marc:leader>
<marc:controlfield tag="001">PPN12345</marc:controlfield>
<marc:controlfield tag="005">20230101120000.0</marc:controlfield>
<marc:datafield tag="245" ind1="1" ind2="0">
<marc:subfield code="a">Test Title</marc:subfield>
<marc:subfield code="b">A Subtitle</marc:subfield>
</marc:datafield>
</marc:record>"""
    return xml_text
@pytest.fixture
def full_marc_xml() -> str:
    """Provide a richer MARCXML record as a string.

    Contents: a leader, control fields 001/005/008, two 020 fields, one 041
    field with two ``a`` subfields, and single 245, 250, 264, 300 and 338
    fields, followed by two 700 fields and one 924 field.
    """
    xml_text = """<?xml version="1.0" encoding="UTF-8"?>
<marc:record xmlns:marc="http://www.loc.gov/MARC21/slim">
<marc:leader>00000nam a22000001i 4500</marc:leader>
<marc:controlfield tag="001">PPN98765</marc:controlfield>
<marc:controlfield tag="005">20231215150000.0</marc:controlfield>
<marc:controlfield tag="008">230101s2023 gw 000 0 ger d</marc:controlfield>
<marc:datafield tag="020" ind1=" " ind2=" ">
<marc:subfield code="a">9783123456789</marc:subfield>
</marc:datafield>
<marc:datafield tag="020" ind1=" " ind2=" ">
<marc:subfield code="a">9783987654321</marc:subfield>
</marc:datafield>
<marc:datafield tag="041" ind1=" " ind2=" ">
<marc:subfield code="a">ger</marc:subfield>
<marc:subfield code="a">eng</marc:subfield>
</marc:datafield>
<marc:datafield tag="245" ind1="1" ind2="0">
<marc:subfield code="a">Comprehensive Test Book</marc:subfield>
<marc:subfield code="b">With Many Details</marc:subfield>
<marc:subfield code="c">by Author Name</marc:subfield>
</marc:datafield>
<marc:datafield tag="250" ind1=" " ind2=" ">
<marc:subfield code="a">3rd edition</marc:subfield>
</marc:datafield>
<marc:datafield tag="264" ind1=" " ind2="1">
<marc:subfield code="a">Berlin</marc:subfield>
<marc:subfield code="b">Test Publisher</marc:subfield>
<marc:subfield code="c">2023</marc:subfield>
</marc:datafield>
<marc:datafield tag="300" ind1=" " ind2=" ">
<marc:subfield code="a">456 pages</marc:subfield>
</marc:datafield>
<marc:datafield tag="338" ind1=" " ind2=" ">
<marc:subfield code="a">Band</marc:subfield>
</marc:datafield>
<marc:datafield tag="700" ind1="1" ind2=" ">
<marc:subfield code="a">Author, First</marc:subfield>
</marc:datafield>
<marc:datafield tag="700" ind1="1" ind2=" ">
<marc:subfield code="a">Author, Second</marc:subfield>
</marc:datafield>
<marc:datafield tag="924" ind1=" " ind2=" ">
<marc:subfield code="9">Frei 129</marc:subfield>
<marc:subfield code="g">ABC 123</marc:subfield>
<marc:subfield code="b">DE-Frei129</marc:subfield>
</marc:datafield>
</marc:record>"""
    return xml_text
@pytest.fixture
def sru_response_xml() -> bytes:
    """Provide an SRU searchRetrieveResponse document as bytes.

    The payload declares two records (PPN001 and PPN002), each wrapping a
    MARC record with a single 245 field, plus an echoed request section.
    """
    payload = b"""<?xml version="1.0" encoding="UTF-8"?>
<zs:searchRetrieveResponse xmlns:zs="http://www.loc.gov/zing/srw/"
xmlns:marc="http://www.loc.gov/MARC21/slim">
<zs:version>1.1</zs:version>
<zs:numberOfRecords>2</zs:numberOfRecords>
<zs:records>
<zs:record>
<zs:recordSchema>marcxml</zs:recordSchema>
<zs:recordPacking>xml</zs:recordPacking>
<zs:recordData>
<marc:record>
<marc:leader>00000nam a22</marc:leader>
<marc:controlfield tag="001">PPN001</marc:controlfield>
<marc:datafield tag="245" ind1=" " ind2=" ">
<marc:subfield code="a">First Book</marc:subfield>
</marc:datafield>
</marc:record>
</zs:recordData>
<zs:recordPosition>1</zs:recordPosition>
</zs:record>
<zs:record>
<zs:recordSchema>marcxml</zs:recordSchema>
<zs:recordPacking>xml</zs:recordPacking>
<zs:recordData>
<marc:record>
<marc:leader>00000nam a22</marc:leader>
<marc:controlfield tag="001">PPN002</marc:controlfield>
<marc:datafield tag="245" ind1=" " ind2=" ">
<marc:subfield code="a">Second Book</marc:subfield>
</marc:datafield>
</marc:record>
</zs:recordData>
<zs:recordPosition>2</zs:recordPosition>
</zs:record>
</zs:records>
<zs:echoedSearchRetrieveRequest>
<zs:version>1.1</zs:version>
<zs:query>pica.tit=Test</zs:query>
<zs:maximumRecords>100</zs:maximumRecords>
<zs:recordPacking>xml</zs:recordPacking>
<zs:recordSchema>marcxml</zs:recordSchema>
</zs:echoedSearchRetrieveRequest>
</zs:searchRetrieveResponse>"""
    return payload
@pytest.fixture
def sru_response_no_records() -> bytes:
    """Provide an SRU response (bytes) that reports zero records.

    The document only contains the version and a ``numberOfRecords`` of 0;
    there is no ``records`` element at all.
    """
    payload = b"""<?xml version="1.0" encoding="UTF-8"?>
<zs:searchRetrieveResponse xmlns:zs="http://www.loc.gov/zing/srw/">
<zs:version>1.1</zs:version>
<zs:numberOfRecords>0</zs:numberOfRecords>
</zs:searchRetrieveResponse>"""
    return payload
# --- Tests for _text helper ---
class TestTextHelper:
    """Checks for the ``_text`` helper."""

    def test_text_with_element_and_text(self):
        """An element with text content yields that text."""
        assert _text(ET.fromstring("<tag>Hello</tag>")) == "Hello"

    def test_text_with_element_no_text(self):
        """An element without text content yields the empty string."""
        assert _text(ET.fromstring("<tag></tag>")) == ""

    def test_text_with_none(self):
        """Passing ``None`` instead of an element yields the empty string."""
        assert _text(None) == ""

    def test_text_with_whitespace(self):
        """Surrounding whitespace in the element text is kept as-is."""
        assert _text(ET.fromstring("<tag> spaced </tag>")) == " spaced "
# --- Tests for parse_marc_record ---
class TestParseMarcRecord:
    """Checks for ``parse_marc_record`` against the MARCXML fixtures."""

    def test_parse_minimal_record(self, minimal_marc_xml):
        """Leader and control fields of the minimal record are read."""
        parsed = parse_marc_record(ET.fromstring(minimal_marc_xml))
        assert parsed.leader == "00000nam a22000001i 4500"
        assert len(parsed.controlfields) == 2
        first_control = parsed.controlfields[0]
        assert first_control.tag == "001"
        assert first_control.value == "PPN12345"

    def test_parse_datafields(self, minimal_marc_xml):
        """The single 245 field keeps its tag, indicators and subfields."""
        parsed = parse_marc_record(ET.fromstring(minimal_marc_xml))
        assert len(parsed.datafields) == 1
        title_field = parsed.datafields[0]
        assert title_field.tag == "245"
        assert title_field.ind1 == "1"
        assert title_field.ind2 == "0"
        assert len(title_field.subfields) == 2
        leading_subfield = title_field.subfields[0]
        assert leading_subfield.code == "a"
        assert leading_subfield.value == "Test Title"

    def test_parse_full_record(self, full_marc_xml):
        """The full record exposes three control fields and the expected tags."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        assert len(parsed.controlfields) == 3
        # Spot-check that a selection of data field tags came through.
        seen_tags = [field.tag for field in parsed.datafields]
        for expected_tag in ("020", "245", "700", "924"):
            assert expected_tag in seen_tags

    def test_parse_multiple_subfields_same_code(self, full_marc_xml):
        """Repeated ``a`` subfields inside field 041 are kept in order."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        # Field 041 in the fixture carries two $a subfields.
        language_field = next(
            field for field in parsed.datafields if field.tag == "041"
        )
        codes_a = [sub.value for sub in language_field.subfields if sub.code == "a"]
        assert codes_a == ["ger", "eng"]
# --- Tests for parse_search_retrieve_response ---
class TestParseSearchRetrieveResponse:
    """Checks for ``parse_search_retrieve_response`` on SRU payloads."""

    def test_parse_response_with_records(self, sru_response_xml):
        """Version, record count and record list are populated."""
        parsed = parse_search_retrieve_response(sru_response_xml)
        assert parsed.version == "1.1"
        assert parsed.numberOfRecords == 2
        assert len(parsed.records) == 2

    def test_parse_response_record_details(self, sru_response_xml):
        """The first record carries schema, packing, position and its MARC data."""
        parsed = parse_search_retrieve_response(sru_response_xml)
        first_record = parsed.records[0]
        assert first_record.recordSchema == "marcxml"
        assert first_record.recordPacking == "xml"
        assert first_record.recordPosition == 1
        assert controlfield_value(first_record.recordData, "001") == "PPN001"

    def test_parse_response_no_records(self, sru_response_no_records):
        """A response reporting zero records produces an empty record list."""
        parsed = parse_search_retrieve_response(sru_response_no_records)
        assert parsed.version == "1.1"
        assert parsed.numberOfRecords == 0
        assert len(parsed.records) == 0

    def test_parse_echoed_request(self, sru_response_xml):
        """The echoed request section is parsed into its own object."""
        parsed = parse_search_retrieve_response(sru_response_xml)
        echo = parsed.echoedSearchRetrieveRequest
        assert echo is not None
        assert echo.version == "1.1"
        assert echo.query == "pica.tit=Test"
        assert echo.maximumRecords == 100
        assert echo.recordSchema == "marcxml"

    def test_parse_response_as_string(self, sru_response_xml):
        """The parser also accepts the payload as ``str`` instead of bytes."""
        as_text = sru_response_xml.decode("utf-8")
        parsed = parse_search_retrieve_response(as_text)
        assert parsed.numberOfRecords == 2
# --- Tests for query helper functions ---
class TestIterDatafields:
    """Checks for ``iter_datafields`` with and without filters."""

    def test_iter_all_datafields(self, full_marc_xml):
        """Without filters, every data field of the record is yielded."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        yielded = list(iter_datafields(parsed))
        assert len(yielded) == len(parsed.datafields)

    def test_iter_datafields_by_tag(self, full_marc_xml):
        """Filtering by tag 020 yields both ISBN fields of the fixture."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        isbn_fields = list(iter_datafields(parsed, tag="020"))
        assert len(isbn_fields) == 2  # Two ISBN fields

    def test_iter_datafields_by_indicator(self, full_marc_xml):
        """Filtering by tag 264 and ind2="1" yields exactly one field."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        matches = list(iter_datafields(parsed, tag="264", ind2="1"))
        assert len(matches) == 1
class TestSubfieldValues:
    """Checks for ``subfield_values`` on the full record fixture."""

    def test_subfield_values_single(self, full_marc_xml):
        """A tag/code pair that occurs once yields a one-element list."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        titles = subfield_values(parsed, "245", "a")
        assert titles == ["Comprehensive Test Book"]

    def test_subfield_values_multiple(self, full_marc_xml):
        """Values from repeated 020 fields are all collected."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        # The fixture contains two separate ISBN (020) fields.
        isbns = subfield_values(parsed, "020", "a")
        assert len(isbns) == 2
        assert "9783123456789" in isbns
        assert "9783987654321" in isbns

    def test_subfield_values_empty(self, full_marc_xml):
        """A tag that is absent from the record yields an empty list."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        missing = subfield_values(parsed, "999", "x")
        assert missing == []
class TestFirstSubfieldValue:
    """Checks for ``first_subfield_value`` on the full record fixture."""

    def test_first_subfield_value_found(self, full_marc_xml):
        """An existing tag/code pair returns its value."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        title = first_subfield_value(parsed, "245", "a")
        assert title == "Comprehensive Test Book"

    def test_first_subfield_value_not_found(self, full_marc_xml):
        """A missing tag returns ``None`` when no default is given."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        missing = first_subfield_value(parsed, "999", "x")
        assert missing is None

    def test_first_subfield_value_with_default(self, full_marc_xml):
        """A missing tag returns the supplied default."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        fallback = first_subfield_value(parsed, "999", "x", default="N/A")
        assert fallback == "N/A"

    def test_first_subfield_value_with_indicator(self, full_marc_xml):
        """Lookup restricted by ind2="1" finds subfield ``c`` of field 264."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        year = first_subfield_value(parsed, "264", "c", ind2="1")
        assert year == "2023"
class TestControlFieldValue:
    """Checks for ``controlfield_value`` on the full record fixture."""

    def test_controlfield_value_found(self, full_marc_xml):
        """Control field 001 returns the record identifier from the fixture."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        identifier = controlfield_value(parsed, "001")
        assert identifier == "PPN98765"

    def test_controlfield_value_not_found(self, full_marc_xml):
        """An absent control field returns ``None`` when no default is given."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        missing = controlfield_value(parsed, "999")
        assert missing is None

    def test_controlfield_value_with_default(self, full_marc_xml):
        """An absent control field returns the supplied default."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        fallback = controlfield_value(parsed, "999", default="unknown")
        assert fallback == "unknown"
class TestFindDatafieldsWithSubfields:
    """Checks for ``find_datafields_with_subfields`` using field 924."""

    def test_find_with_where_all(self, full_marc_xml):
        """A matching ``where_all`` condition returns the 924 field."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        condition = {"9": "Frei 129"}
        matches = find_datafields_with_subfields(parsed, "924", where_all=condition)
        assert len(matches) == 1
        assert matches[0].tag == "924"

    def test_find_with_where_all_not_found(self, full_marc_xml):
        """A ``where_all`` value that occurs nowhere returns no fields."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        condition = {"9": "NonExistent"}
        matches = find_datafields_with_subfields(parsed, "924", where_all=condition)
        assert len(matches) == 0

    def test_find_with_casefold(self, full_marc_xml):
        """With ``casefold=True`` a lowercase value still matches."""
        parsed = parse_marc_record(ET.fromstring(full_marc_xml))
        # The fixture stores "Frei 129"; the query deliberately uses lowercase.
        condition = {"9": "frei 129"}
        matches = find_datafields_with_subfields(
            parsed,
            "924",
            where_all=condition,
            casefold=True,
        )
        assert len(matches) == 1
class TestDatafieldValue:
    """Checks for ``datafield_value`` on hand-built ``DataField`` objects."""

    def test_datafield_value_found(self):
        """Each present subfield code returns its own value."""
        title_parts = [
            SubField(code="a", value="Title"),
            SubField(code="b", value="Subtitle"),
        ]
        field = DataField(tag="245", subfields=title_parts)
        assert datafield_value(field, "a") == "Title"
        assert datafield_value(field, "b") == "Subtitle"

    def test_datafield_value_not_found(self):
        """A subfield code that is absent returns ``None``."""
        only_title = [SubField(code="a", value="Title")]
        field = DataField(tag="245", subfields=only_title)
        assert datafield_value(field, "z") is None

    def test_datafield_value_with_default(self):
        """A field without subfields returns the supplied default."""
        field = DataField(tag="245", subfields=[])
        assert datafield_value(field, "a", default="N/A") == "N/A"
class TestDatafieldsValue:
    """Checks for ``datafields_value`` on lists of ``DataField`` objects."""

    def test_datafields_value_found(self):
        """With two matching fields, the value of the first one is returned."""
        author_one = DataField(
            tag="700", subfields=[SubField(code="a", value="Author One")]
        )
        author_two = DataField(
            tag="700", subfields=[SubField(code="a", value="Author Two")]
        )
        assert datafields_value([author_one, author_two], "a") == "Author One"

    def test_datafields_value_empty_list(self):
        """An empty field list returns ``None``."""
        assert datafields_value([], "a") is None
class TestSubfieldValuesFromFields:
    """Checks for ``subfield_values_from_fields``."""

    def test_values_from_multiple_fields(self):
        """Values are gathered from every field, in field order."""
        author_one = DataField(
            tag="700", subfields=[SubField(code="a", value="Author One")]
        )
        author_two = DataField(
            tag="700", subfields=[SubField(code="a", value="Author Two")]
        )
        collected = subfield_values_from_fields([author_one, author_two], "a")
        assert collected == ["Author One", "Author Two"]
class TestFirstSubfieldValueFromFields:
    """Checks for ``first_subfield_value_from_fields``."""

    def test_first_value_from_fields(self):
        """With two matching fields, the value of the first one is returned."""
        leading = DataField(tag="700", subfields=[SubField(code="a", value="First")])
        trailing = DataField(tag="700", subfields=[SubField(code="a", value="Second")])
        assert first_subfield_value_from_fields([leading, trailing], "a") == "First"
# --- Tests for _smart_join_title ---
class TestSmartJoinTitle:
    """Checks for ``_smart_join_title`` (main title plus optional subtitle)."""

    def test_join_with_subtitle(self):
        """Title and subtitle are joined with " : "."""
        assert _smart_join_title("Main Title", "Subtitle") == "Main Title : Subtitle"

    def test_join_without_subtitle(self):
        """A ``None`` subtitle leaves the title unchanged."""
        assert _smart_join_title("Main Title", None) == "Main Title"

    def test_join_with_empty_subtitle(self):
        """An empty subtitle leaves the title unchanged."""
        assert _smart_join_title("Main Title", "") == "Main Title"

    def test_join_with_existing_colon(self):
        """A title already ending in ":" gets no second separator."""
        assert _smart_join_title("Main Title:", "Subtitle") == "Main Title: Subtitle"

    def test_join_with_existing_semicolon(self):
        """A title already ending in ";" gets no extra separator."""
        assert _smart_join_title("Main Title;", "More") == "Main Title; More"

    def test_join_strips_whitespace(self):
        """Surrounding whitespace on both parts is removed before joining."""
        joined = _smart_join_title(" Main Title ", " Subtitle ")
        assert joined == "Main Title : Subtitle"