Skip to content

Pre-annotate data with EDS-NLP

This tutorial shows how to pre-annotate date spans with EDS-NLP and add a boolean mammography flag when the date appears close to a mammography mention.

The goal is to expose this flag in Metanno as a checkbox in the entity widgets.

Build a contextual date pipeline

Use eds.dates to detect dates, then eds.contextual_matcher to enrich each date with an assigned value when a mammography term appears in the local context.

import edsnlp
import edsnlp.pipes as eds

# Start from an empty "eds" pipeline and register the components in order.
nlp = edsnlp.blank("eds")

for component in (eds.sentences(), eds.normalizer(), eds.dates()):
    nlp.add_pipe(component)

# Enrich each span read from the "dates" span getter: when the regex matches
# inside the window around the span, the match is stored under the
# "mammography" assign name.
# NOTE(review): the trailing semicolon presumably keeps a notebook from echoing
# the value returned by add_pipe -- it has no effect in a plain script.
nlp.add_pipe(
    eds.contextual_matcher(
        patterns=[
            {
                "span_getter": "dates",
                "assign": [
                    {
                        "name": "mammography",
                        "regex": "(mammo-?graph)",
                        "window": "words[-10:10] & sent",
                        "reduce_mode": "keep_first",
                    },
                ],
                "source": "mammography",
            },
        ],
        label="date",
    ),
);

Run it on some data first to ensure it works:

# A single French note with a date next to a mammography mention.
texts = [
    "Le patient a eu une mammographie le 12 juin. Aucune autre mammographie prévue.",
]

# Run the pipeline and tabulate the entities together with their "assigned"
# and "date" span attributes.
nlp.pipe(texts).to_pandas(
    converter="ents",
    span_attributes=["assigned", "date"],
)
note_id start end label lexical_variant span_type assigned date
0 None 36 43 date 12 juin ents {'mammography': 'mammographie'} ????-06-12

Tip

If you only want dates with nearby mammography context, set required=True in the assign block of eds.contextual_matcher.

Apply the pipeline to some data

We'll assume you have a collection of documents you want to apply the pipeline to.

The example below shows how to load parquet data. Visit EDS-NLP's Data API docs to learn about other formats.

import uuid


def build_data():
    """Run the pipeline over ``texts`` and collect the results as plain dicts.

    Returns:
        A dict with a single ``"notes"`` key. Its value is a list with one
        dict per processed document, holding ``note_id``, ``note_text``,
        ``seen`` (always ``False`` here) and ``entities``. Each entity dict
        describes one ``date`` entity (``id``, ``text``, ``begin``, ``end``,
        ``label``, ``concept``) plus a boolean ``mammography`` flag that is
        ``True`` when the matcher assigned a truthy ``mammography`` value.
    """
    # Apply the pipeline.
    # To read a real dataset instead, replace the in-memory source below with:
    # data = edsnlp.data.read_parquet(path_to_parquet_dataset)
    data = edsnlp.data.from_iterable(
        [{"note_id": f"#{i}", "note_text": t} for i, t in enumerate(texts)],
        converter="omop",
    )
    data = data.map_pipeline(nlp)

    # And assemble your data in collections of dicts
    notes = []
    for doc in data:
        note_entities = []

        for e in doc.ents:  # or doc.spans[...] if this is where your entities are
            if e.label_ != "date":
                continue
            # `assigned` may be empty/None when nothing matched in the window.
            assigned = e._.assigned or {}
            note_entities.append(
                {
                    # Fresh random identifier for every entity on every call.
                    "id": f"#{uuid.uuid4()}",
                    "text": str(e),
                    "begin": e.start_char,
                    "end": e.end_char,
                    "label": "date",
                    "concept": None,
                    "mammography": bool(assigned.get("mammography")),
                }
            )

        notes.append(
            {
                "note_id": str(doc._.note_id),
                "note_text": doc.text,
                "seen": False,
                "entities": note_entities,
            }
        )

    return {"notes": notes}

Build the widgets (note text + info form)

Create a DataWidgetFactory, then build:

  1. A note text view with editable entity fields (including mammography).
  2. An info view similar to Quaero: note form (with navigation buttons) + selected entity info.
from pret_joy import Box, Divider, Stack

from metanno.recipes.data_widget_factory import DataWidgetFactory, infer_fields

# The factory receives the builder callable itself, not its result.
factory = DataWidgetFactory(
    data=build_data,
    # sync=True/path, enable sync to sync the user edits with the kernel/server
)

# Field descriptions are inferred from every entity of every note; only
# "label" and "mammography" are shown and editable.
entity_fields = infer_fields(
    [entity for note in factory.data["notes"] for entity in note["entities"]],
    visible_keys=["label", "mammography"],
    editable_keys=["label", "mammography"],
    categorical_keys=["label"],
)

# Annotated text view plus the companion view for the entity fields.
note_text_view, ent_view = factory.create_text_widget(
    store_text_key="notes",
    store_spans_key="notes.entities",
    text_key="note_text",
    text_primary_key="note_id",
    spans_primary_key="id",
    fields=entity_fields,
    labels={
        "date": {"name": "Date", "color": "lightblue", "shortcut": "d"},
    },
)

# Note-level form: the note id and an editable "seen" flag, with navigation
# buttons.
note_form_view = factory.create_form_widget(
    store_key="notes",
    primary_key="note_id",
    fields=[
        {"key": "note_id", "kind": "text"},
        {"key": "seen", "kind": "boolean", "editable": True},
    ],
    add_navigation_buttons=True,
)

# Stack the note form above the entity view, separated by a divider.
info_view = Stack(
    Box(note_form_view, sx={"m": "10px"}),
    Divider(),
    Box(ent_view, sx={"m": "10px"}),
)

mammography is inferred as a boolean field, so it is rendered as a checkbox/toggle editor.

You can now either view these widgets separately, or arrange them in a single layout, as we did in the "Run the Quaero Explorer" tutorial.

from pret.react import div
from pret_simple_dock import Layout, Panel

# Header for the text panel, built from the "note_id" field of the "notes"
# store, with "Note" as the fallback.
note_header = factory.create_selected_field_view(
    store_key="notes", shown_key="note_id", fallback="Note"
)

# Two tab groups side by side in a row.
# NOTE(review): the sizes are presumably relative widths (65/35) -- confirm
# against the pret_simple_dock documentation.
dock_config = {
    "kind": "row",
    "children": [
        {"tabs": ["Note Text"], "size": 65},
        {"tabs": ["Info"], "size": 35},
    ],
}

# Styling applied to the outer container.
container_style = {
    "background": "var(--joy-palette-background-level2, #f0f0f0)",
    "width": "100%",
    "height": "100%",
    "minHeight": "420px",
    "--sd-background-color": "transparent",
}

# Panel keys must match the tab names referenced in dock_config.
layout = div(
    Layout(
        Panel(note_text_view, key="Note Text", header=note_header),
        Panel(info_view, key="Info"),
        default_config=dock_config,
    ),
    style=container_style,
)

layout