Feather DB for Research Assistants: Memory That Accumulates Across Sessions

The research assistant memory problem

Research assistants fail at the knowledge accumulation problem. You ingest 50 papers in session 1, ask questions, and get answers. In session 2, you have to re-ingest the same papers or start fresh. And even within a session, the assistant doesn't know which papers contradict each other, which papers a key claim rests on, or which open questions have accumulated evidence over time.

A Feather DB-backed research assistant solves this: papers are ingested once and persist. Edges link citation relationships and contradictions. Open research questions track evidence as it accumulates. And different paper types decay at different rates — older, foundational papers decay the slowest (a two-year half-life in the code below), while preprints fade within months as they're superseded.

Step 1: Paper ingestion with metadata

import feather_db as fdb
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

# Shared database handle. The helpers below that do not take an explicit `db`
# argument read this module-level name.
# NOTE(review): dim=768 presumably has to match the length of the vectors
# produced by the embed_fn passed to those helpers — confirm.
db = fdb.DB.open("research_memory.feather", dim=768)

@dataclass
class Paper:
    """Bibliographic record describing one paper to be ingested.

    Field order matters: it defines the positional order of the generated
    ``__init__``. Only ``arxiv_id`` is optional.
    """

    # Title and abstract are combined into the text stored for the paper.
    title: str
    abstract: str
    # Author names, in order.
    authors: List[str]
    # Publication year.
    year: int
    # Individual statements made by the paper, one string per claim.
    key_claims: List[str]
    # Where the paper appeared, e.g. "NeurIPS 2024" or "arxiv".
    venue: str
    # arXiv identifier when the paper has one; None otherwise.
    arxiv_id: Optional[str] = None

def _decay_profile(paper: Paper) -> tuple:
    """Return ``(half_life, importance)`` for *paper* based on venue and age."""
    # Read the clock once so every comparison below uses the same year.
    current_year = datetime.now().year

    if "arxiv" in paper.venue.lower():
        return 90, 0.9    # preprints: 3 months
    if paper.year >= current_year - 1:
        return 365, 1.2   # recent conference papers: 1 year
    # older papers: 2 years+ (foundational); only papers more than three
    # years old get the higher importance.
    return 730, (1.5 if paper.year < current_year - 3 else 1.2)


def ingest_paper(paper: Paper, namespace: str, embed_fn) -> dict:
    """Ingest a paper and return memory IDs for edge linking.

    The title and abstract are stored together as one memory under entity
    "papers". Each key claim is stored as its own memory under entity
    "claims" and linked from the abstract with a "supports" edge.

    Args:
        paper: The paper to ingest.
        namespace: Namespace the memories are added to.
        embed_fn: Callable mapping a string to the vector passed to ``db.add``.

    Returns:
        A dict with two keys: ``"abstract"`` (the abstract memory's id) and
        ``"claims"`` (a list of claim memory ids, in ``paper.key_claims``
        order; empty when the paper has no key claims).
    """
    half_life, importance = _decay_profile(paper)

    # Ingest abstract as the primary representation
    abstract_text = f"{paper.title}. {paper.abstract}"
    abstract_mem = db.add(embed_fn(abstract_text), text=abstract_text,
                          namespace=namespace,
                          entity="papers")
    abstract_attrs = {
        "paper_title": paper.title,
        "authors": ", ".join(paper.authors[:3]),  # first three authors only
        "year": paper.year,
        "venue": paper.venue,
        "type": "abstract",
        "importance": importance,
        "half_life": half_life,
    }
    if paper.arxiv_id:
        abstract_attrs["arxiv_id"] = paper.arxiv_id
    for key, value in abstract_attrs.items():
        abstract_mem.meta.set_attribute(key, value)

    # Ingest each key claim separately
    claim_importance = importance * 1.1  # claims slightly higher
    claim_ids = []
    for index, claim in enumerate(paper.key_claims):
        claim_text = f"[{paper.title}] {claim}"
        claim_mem = db.add(embed_fn(claim_text), text=claim_text,
                           namespace=namespace,
                           entity="claims")
        claim_attrs = {
            "paper_title": paper.title,
            "claim_index": index,
            "type": "claim",
            "importance": claim_importance,
            "half_life": half_life,
        }
        for key, value in claim_attrs.items():
            claim_mem.meta.set_attribute(key, value)

        # Abstract supports each of its own claims
        db.add_edge(abstract_mem.id, claim_mem.id, edge_type="supports")
        claim_ids.append(claim_mem.id)

    return {"abstract": abstract_mem.id, "claims": claim_ids}

Step 2: Edge linking — citations and contradictions

def link_citation(citing_title: str, cited_title: str,
                  namespace: str, db: fdb.DB, embed_fn):
    """Link two papers with a cites/supports relationship.

    Each title is embedded and resolved to its closest "abstract" memory in
    *namespace*. A "supports" edge is then added from the citing paper to
    the cited paper and a confirmation line is printed.

    Nothing is linked (and nothing is printed) when either lookup returns no
    result, or when both titles resolve to the same memory.

    Args:
        citing_title: Title of the paper that contains the citation.
        cited_title: Title of the paper being cited.
        namespace: Namespace to search in.
        db: Database handle used for the searches and the edge.
        embed_fn: Callable mapping a string to a query vector.
    """
    def nearest_abstract(title: str):
        # Top-1 search restricted to abstract memories.
        return db.search(
            embed_fn(title), k=1, namespace=namespace,
            filter={"type": "abstract"}
        )

    # Find the abstract memories for both papers
    citing_results = nearest_abstract(citing_title)
    cited_results = nearest_abstract(cited_title)
    if not (citing_results and cited_results):
        return

    citing_id = citing_results[0].id
    cited_id = cited_results[0].id
    # A nearest-neighbour lookup can resolve both titles to the same memory
    # (for example when only one of the two papers has been ingested).
    # Linking then would create a self-referencing edge, so skip it.
    if citing_id == cited_id:
        return

    db.add_edge(citing_id, cited_id, edge_type="supports")
    print(f"Linked: {citing_title[:50]} -> {cited_title[:50]}")

def link_contradiction(paper_a_title: str, paper_b_title: str,
                       claim_a: str, claim_b: str,
                       namespace: str, db: fdb.DB, embed_fn):
    """Link two contradicting claims with a contradicts edge.

    Each claim is looked up by embedding ``"[<paper title>] <claim>"`` and
    taking the closest "claim" memory in *namespace*. A "contradicts" edge
    is added in both directions and a confirmation line is printed.

    Nothing is linked (and nothing is printed) when either lookup returns no
    result, or when both claims resolve to the same memory.

    Args:
        paper_a_title: Title of the paper making ``claim_a``.
        paper_b_title: Title of the paper making ``claim_b``.
        claim_a: Claim text from the first paper.
        claim_b: Claim text from the second paper.
        namespace: Namespace to search in.
        db: Database handle used for the searches and the edges.
        embed_fn: Callable mapping a string to a query vector.
    """
    def nearest_claim(paper_title: str, claim: str):
        # Top-1 search restricted to claim memories.
        return db.search(
            embed_fn(f"[{paper_title}] {claim}"), k=1, namespace=namespace,
            filter={"type": "claim"}
        )

    # Find specific claim memories
    results_a = nearest_claim(paper_a_title, claim_a)
    results_b = nearest_claim(paper_b_title, claim_b)
    if not (results_a and results_b):
        return

    id_a = results_a[0].id
    id_b = results_b[0].id
    # Both lookups can land on the same memory (for example when only one of
    # the claims has been ingested). A claim cannot contradict itself, so do
    # not create self-referencing edges.
    if id_a == id_b:
        return

    # Contradiction is symmetric: record it in both directions.
    db.add_edge(id_a, id_b, edge_type="contradicts")
    db.add_edge(id_b, id_a, edge_type="contradicts")
    print(f"Contradiction linked between {paper_a_title[:30]} and {paper_b_title[:30]}")

Step 3: Research question tracking

def add_research_question(question: str, namespace: str, embed_fn):
    """Store an open research question so evidence can be tracked against it.

    Args:
        question: The question text; it is both embedded and stored as text.
        namespace: Namespace the question memory is added to.
        embed_fn: Callable mapping a string to the vector passed to ``db.add``.

    Returns:
        The newly added memory, tagged as an "open_question" with importance
        2.0 and an evidence count of 0.
    """
    question_mem = db.add(
        embed_fn(question),
        text=question,
        namespace=namespace,
        entity="research-questions",
    )

    # Initial attributes, applied in this order.
    initial_attrs = (
        ("type", "open_question"),
        ("importance", 2.0),
        ("evidence_count", 0),
    )
    for attr_name, attr_value in initial_attrs:
        question_mem.meta.set_attribute(attr_name, attr_value)

    return question_mem

def add_evidence(question_text: str, evidence_text: str, paper_title: str,
                 namespace: str, embed_fn, supports: bool = True):
    """Link evidence from a paper to a research question.

    The question is resolved by embedding *question_text* and taking the
    closest "open_question" memory in *namespace*. The evidence is stored as
    a new memory under entity "evidence", linked to the question with a
    "supports" or "contradicts" edge, and the question's ``evidence_count``
    attribute is incremented.

    Args:
        question_text: Text used to look up the research question.
        evidence_text: The evidence statement. It is embedded as given; the
            stored text is prefixed with ``[paper_title]``.
        paper_title: Title of the paper the evidence comes from.
        namespace: Namespace to search and add in.
        embed_fn: Callable mapping a string to a vector.
        supports: True if the evidence supports the question, False if it
            contradicts it.

    Returns:
        The new evidence memory, or None when no question was found.
    """
    # Find the question
    q_results = db.search(embed_fn(question_text), k=1, namespace=namespace,
                          filter={"type": "open_question"})
    if not q_results:
        return None
    question_mem = q_results[0]

    # Embed the evidence only once a question has been found, so no
    # embedding call is spent when there is nothing to attach it to.
    evidence_mem = db.add(embed_fn(evidence_text),
                          text=f"[{paper_title}] {evidence_text}",
                          namespace=namespace,
                          entity="evidence")
    evidence_mem.meta.set_attribute("type", "evidence")
    evidence_mem.meta.set_attribute("supports_question", supports)
    evidence_mem.meta.set_attribute("paper_title", paper_title)

    edge_type = "supports" if supports else "contradicts"
    db.add_edge(evidence_mem.id, question_mem.id, edge_type=edge_type)

    # Increment evidence count; a missing or falsy stored value counts as 0.
    # NOTE(review): this sets the attribute on the object returned by
    # db.search — confirm that such in-place metadata changes are persisted.
    count = int(question_mem.meta.get_attribute("evidence_count") or 0)
    question_mem.meta.set_attribute("evidence_count", count + 1)

    return evidence_mem

Step 4: context_chain for full paper context

def research_query(query: str, namespace: str, embed_fn, k: int = 5):
    """Run a context-chain query and return the hits grouped by memory type.

    Args:
        query: Natural-language query; embedded with *embed_fn*.
        namespace: Namespace to query.
        embed_fn: Callable mapping a string to a query vector.
        k: Value forwarded to ``db.context_chain`` as ``k``.

    Returns:
        A dict with keys "papers", "claims", "evidence" and "questions",
        each holding the list of chain memories whose "type" attribute is
        "abstract", "claim", "evidence" or "open_question" respectively.
        Memories with any other type are left out.
    """
    # context_chain: find relevant papers + traverse citations + contradictions
    chain = db.context_chain(
        embed_fn(query),
        k=k,
        namespace=namespace,
        max_depth=2,    # 2 hops = paper -> citation -> their citations
        half_life=365,  # research stays relevant for a year
    )

    papers, claims, evidence, questions = [], [], [], []
    # (type attribute value, destination list), checked in this order.
    routing = (
        ("abstract", papers),
        ("claim", claims),
        ("evidence", evidence),
        ("open_question", questions),
    )

    for mem in chain:
        mem_type = mem.meta.get_attribute("type")
        for type_name, bucket in routing:
            if mem_type == type_name:
                bucket.append(mem)
                break

    return {
        "papers": papers,
        "claims": claims,
        "evidence": evidence,
        "questions": questions,
    }

# Usage
# NOTE(review): `embed` is not defined in the visible snippet — it stands for
# the caller's text-embedding function and must be supplied before running.
results = research_query(
    query="Does attention mechanism benefit from sparse patterns?",
    namespace="phd-research",
    embed_fn=embed
)

# Summary: number of hits in each group returned by research_query.
print(f"Found {len(results['papers'])} relevant papers")
print(f"Found {len(results['claims'])} supporting/contradicting claims")
print(f"Found {len(results['questions'])} related open questions")

# Per paper: title, year and score, then the first 150 characters of its text.
for paper_mem in results["papers"]:
    print(f"\n{paper_mem.meta.get_attribute('paper_title')} "
          f"({paper_mem.meta.get_attribute('year')}) "
          f"- score: {paper_mem.score:.3f}")
    print(f"  {paper_mem.text[:150]}...")

The result is a research assistant that gets smarter over time. Every paper you ingest becomes part of a knowledge graph. Every contradiction you identify is a permanent edge that will surface whenever either paper is retrieved. Every open question accumulates evidence from future ingested papers. And foundational papers — the ones you cite in every session — build stickiness through recall, making them effectively permanent fixtures in your knowledge base.

Install: pip install feather-db · GitHub: github.com/feather-store/feather