Feather DB for Research Assistants: Memory That Accumulates Across Sessions
A research assistant should know what you've read, what you've concluded, and which papers contradict each other — without repeating ingestion every session. Here's how to build one that accumulates knowledge with Feather DB.
The research assistant memory problem
Research assistants fail at knowledge accumulation. You ingest 50 papers in session 1, ask questions, and get answers. In session 2, you have to re-ingest the same papers or start fresh. And even within a session, the assistant doesn't know which papers contradict each other, which earlier papers a key claim builds on, or which open questions have accumulated evidence over time.
A Feather DB-backed research assistant solves this: papers are ingested once and persist. Edges link citation relationships and contradictions. Open research questions track evidence as it accumulates. And different paper types decay at different rates — foundational papers decay slowly and stay relevant for years, while preprints fade as they're superseded.
Step 1: Paper ingestion with metadata
import feather_db as fdb
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional
# Module-level memory store used by the ingestion and query functions below.
# NOTE(review): dim=768 presumably has to equal the length of the vectors
# returned by the embed_fn passed to those functions — confirm against the
# embedding model in use.
db = fdb.DB.open("research_memory.feather", dim=768)
@dataclass
class Paper:
    """Plain record describing one paper to be ingested into research memory."""

    # Paper title; also used as the prefix of every stored claim text.
    title: str
    # Abstract text; stored together with the title as the primary memory.
    abstract: str
    # Author names in publication order.
    authors: List[str]
    # Publication year.
    year: int
    # Individual claims, each stored as its own memory.
    key_claims: List[str]
    # Publication venue, e.g. "NeurIPS 2024" or "arxiv".
    venue: str
    # arXiv identifier, when the paper has one.
    arxiv_id: Optional[str] = None
def _decay_profile(venue: str, year: int, current_year: int) -> tuple:
    """Return ``(half_life_days, importance)`` for a paper's venue and year."""
    if "arxiv" in venue.lower():
        return 90, 0.9  # preprints: 3 months
    if year >= current_year - 1:
        return 365, 1.2  # recent conference papers: 1 year
    # Older papers: 2 years; those more than 3 years old get the highest
    # importance because they are treated as foundational.
    return 730, (1.5 if year < current_year - 3 else 1.2)


def ingest_paper(paper: Paper, namespace: str, embed_fn) -> dict:
    """Ingest a paper and return memory IDs for edge linking.

    The title and abstract are embedded together and stored under the
    ``"papers"`` entity. Each key claim is embedded as ``"[title] claim"``,
    stored under the ``"claims"`` entity, and linked from the abstract with a
    ``"supports"`` edge. All memories are written to the module-level ``db``.

    Args:
        paper: The paper to ingest.
        namespace: Namespace the memories are stored in.
        embed_fn: Callable mapping a string to an embedding vector.

    Returns:
        ``{"abstract": <abstract memory id>, "claims": [<claim memory ids>]}``.
    """
    # Read the clock once so both year comparisons use the same "now".
    half_life, importance = _decay_profile(
        paper.venue, paper.year, datetime.now().year
    )

    # Ingest the abstract as the primary representation of the paper.
    abstract_text = f"{paper.title}. {paper.abstract}"
    abstract_mem = db.add(
        embed_fn(abstract_text),
        text=abstract_text,
        namespace=namespace,
        entity="papers",
    )
    abstract_attrs = [
        ("paper_title", paper.title),
        ("authors", ", ".join(paper.authors[:3])),  # first three authors only
        ("year", paper.year),
        ("venue", paper.venue),
        ("type", "abstract"),
        ("importance", importance),
        ("half_life", half_life),
    ]
    if paper.arxiv_id:
        abstract_attrs.append(("arxiv_id", paper.arxiv_id))
    for key, value in abstract_attrs:
        abstract_mem.meta.set_attribute(key, value)

    # Ingest each key claim as its own memory so it can be linked individually.
    claim_ids = []
    for index, claim in enumerate(paper.key_claims):
        claim_text = f"[{paper.title}] {claim}"
        claim_mem = db.add(
            embed_fn(claim_text),
            text=claim_text,
            namespace=namespace,
            entity="claims",
        )
        claim_attrs = [
            ("paper_title", paper.title),
            ("claim_index", index),
            ("type", "claim"),
            ("importance", importance * 1.1),  # claims rank slightly higher
            ("half_life", half_life),
        ]
        for key, value in claim_attrs:
            claim_mem.meta.set_attribute(key, value)
        # The abstract supports each of its own claims.
        db.add_edge(abstract_mem.id, claim_mem.id, edge_type="supports")
        claim_ids.append(claim_mem.id)

    return {"abstract": abstract_mem.id, "claims": claim_ids}
Step 2: Edge linking — citations and contradictions
def link_citation(citing_title: str, cited_title: str,
                  namespace: str, db: fdb.DB, embed_fn):
    """Link two ingested papers with a citation edge.

    Each paper is looked up by searching for the abstract memory nearest to
    its title. Because a nearest-neighbour search returns *some* abstract even
    when the requested paper was never ingested, the hit is accepted only if
    its stored ``paper_title`` matches the requested title (ignoring case and
    surrounding whitespace). Hits that carry no ``paper_title`` attribute are
    accepted as-is. The edge goes from the citing abstract to the cited
    abstract and is stored with ``edge_type="supports"``.

    Args:
        citing_title: Title of the paper that cites.
        cited_title: Title of the paper being cited.
        namespace: Namespace to search in.
        db: Database holding the paper memories.
        embed_fn: Callable mapping a string to an embedding vector.

    Returns:
        True if the edge was added, False if either paper could not be
        resolved or both titles resolved to the same memory.
    """
    def _find_abstract(title):
        hits = db.search(
            embed_fn(title), k=1, namespace=namespace,
            filter={"type": "abstract"}
        )
        if not hits:
            return None
        stored = hits[0].meta.get_attribute("paper_title")
        if stored is not None and (
            str(stored).strip().casefold() != title.strip().casefold()
        ):
            # Nearest abstract belongs to a different paper.
            return None
        return hits[0]

    citing = _find_abstract(citing_title)
    cited = _find_abstract(cited_title)

    # Refuse to link when a paper is missing or both titles hit one memory;
    # otherwise an unrelated edge or a self-citation would be recorded.
    if citing is None or cited is None or citing.id == cited.id:
        print(f"Skipped citation link: {citing_title[:50]} -> {cited_title[:50]}")
        return False

    db.add_edge(citing.id, cited.id, edge_type="supports")
    print(f"Linked: {citing_title[:50]} -> {cited_title[:50]}")
    return True
def link_contradiction(paper_a_title: str, paper_b_title: str,
                       claim_a: str, claim_b: str,
                       namespace: str, db: fdb.DB, embed_fn):
    """Link two contradicting claims with ``contradicts`` edges in both directions.

    Each claim is looked up by searching for the claim memory nearest to
    ``"[<paper title>] <claim>"``. A hit is accepted only if its stored
    ``paper_title`` matches the given paper title (ignoring case and
    surrounding whitespace), so a claim from an unrelated paper is never
    linked by accident. Hits without a ``paper_title`` attribute are accepted
    as-is.

    Args:
        paper_a_title: Title of the paper making ``claim_a``.
        paper_b_title: Title of the paper making ``claim_b``.
        claim_a: Text of the first claim.
        claim_b: Text of the second claim.
        namespace: Namespace to search in.
        db: Database holding the claim memories.
        embed_fn: Callable mapping a string to an embedding vector.

    Returns:
        True if both edges were added, False if either claim could not be
        resolved or both lookups resolved to the same memory.
    """
    def _find_claim(paper_title, claim):
        hits = db.search(
            embed_fn(f"[{paper_title}] {claim}"), k=1, namespace=namespace,
            filter={"type": "claim"}
        )
        if not hits:
            return None
        stored = hits[0].meta.get_attribute("paper_title")
        if stored is not None and (
            str(stored).strip().casefold() != paper_title.strip().casefold()
        ):
            # Nearest claim belongs to a different paper.
            return None
        return hits[0]

    hit_a = _find_claim(paper_a_title, claim_a)
    hit_b = _find_claim(paper_b_title, claim_b)

    # A claim cannot contradict itself; skip when lookups are missing or collide.
    if hit_a is None or hit_b is None or hit_a.id == hit_b.id:
        print(f"Skipped contradiction link between {paper_a_title[:30]} and {paper_b_title[:30]}")
        return False

    # Contradiction is symmetric, so record it in both directions.
    db.add_edge(hit_a.id, hit_b.id, edge_type="contradicts")
    db.add_edge(hit_b.id, hit_a.id, edge_type="contradicts")
    print(f"Contradiction linked between {paper_a_title[:30]} and {paper_b_title[:30]}")
    return True
Step 3: Research question tracking
def add_research_question(question: str, namespace: str, embed_fn):
    """Store an open research question so evidence can later be linked to it.

    The question is embedded with ``embed_fn`` and added to the module-level
    ``db`` under the ``"research-questions"`` entity, tagged as an
    ``"open_question"`` with importance 2.0 and an evidence count of zero.

    Returns:
        The memory object returned by ``db.add``.
    """
    question_mem = db.add(
        embed_fn(question),
        text=question,
        namespace=namespace,
        entity="research-questions",
    )
    initial_attrs = (
        ("type", "open_question"),
        ("importance", 2.0),
        ("evidence_count", 0),
    )
    for key, value in initial_attrs:
        question_mem.meta.set_attribute(key, value)
    return question_mem
def add_evidence(question_text: str, evidence_text: str, paper_title: str,
                 namespace: str, embed_fn, supports: bool = True):
    """Link evidence from a paper to a research question.

    The open question nearest to ``question_text`` is looked up in the
    module-level ``db``. The evidence is then stored as its own memory under
    the ``"evidence"`` entity and connected to that question with a
    ``"supports"`` or ``"contradicts"`` edge, and the question's
    ``evidence_count`` attribute is incremented.

    Args:
        question_text: Text used to find the open question.
        evidence_text: The finding to record.
        paper_title: Title of the paper the finding comes from.
        namespace: Namespace to search and store in.
        embed_fn: Callable mapping a string to an embedding vector.
        supports: True if the evidence supports the question, False if it
            contradicts it.

    Returns:
        The new evidence memory, or None when no open question was found.
    """
    # Find the question first: if none exists there is nothing to attach to,
    # and the evidence embedding call would be wasted.
    q_results = db.search(embed_fn(question_text), k=1, namespace=namespace,
                          filter={"type": "open_question"})
    if not q_results:
        return None
    question_mem = q_results[0]

    # The vector is computed from the bare evidence text, while the stored
    # text carries the paper title as a prefix.
    evidence_mem = db.add(embed_fn(evidence_text),
                          text=f"[{paper_title}] {evidence_text}",
                          namespace=namespace,
                          entity="evidence")
    evidence_mem.meta.set_attribute("type", "evidence")
    evidence_mem.meta.set_attribute("supports_question", supports)
    evidence_mem.meta.set_attribute("paper_title", paper_title)

    edge_type = "supports" if supports else "contradicts"
    db.add_edge(evidence_mem.id, question_mem.id, edge_type=edge_type)

    # Increment the evidence count; a missing attribute counts as zero.
    count = int(question_mem.meta.get_attribute("evidence_count") or 0)
    question_mem.meta.set_attribute("evidence_count", count + 1)
    return evidence_mem
Step 4: context_chain for full paper context
def research_query(query: str, namespace: str, embed_fn, k: int = 5,
                   max_depth: int = 2, half_life: int = 365):
    """Query the research memory and group the results by memory type.

    The query is embedded and passed to ``db.context_chain`` on the
    module-level ``db``. Every returned memory is sorted into a bucket
    according to its ``type`` attribute; memories with any other type are
    dropped.

    Args:
        query: Natural-language query.
        namespace: Namespace to search in.
        embed_fn: Callable mapping a string to an embedding vector.
        k: Passed to ``context_chain`` as ``k``.
        max_depth: Passed to ``context_chain`` as ``max_depth``; the default
            of 2 is the value the function previously hard-coded.
        half_life: Passed to ``context_chain`` as ``half_life``; the default
            of 365 is the value the function previously hard-coded.

    Returns:
        A dict with the keys ``"papers"``, ``"claims"``, ``"evidence"`` and
        ``"questions"``, each mapping to a list of memories.
    """
    chain = db.context_chain(
        embed_fn(query),
        k=k,
        namespace=namespace,
        max_depth=max_depth,
        half_life=half_life,
    )

    # Maps a memory's "type" attribute to the result bucket it belongs in.
    bucket_for_type = {
        "abstract": "papers",
        "claim": "claims",
        "evidence": "evidence",
        "open_question": "questions",
    }
    grouped = {"papers": [], "claims": [], "evidence": [], "questions": []}
    for mem in chain:
        bucket = bucket_for_type.get(mem.meta.get_attribute("type"))
        if bucket is not None:
            grouped[bucket].append(mem)
    return grouped
# Usage
# NOTE(review): `embed` is not defined in this snippet — it is assumed to be
# the embedding callable supplied by the reader.
results = research_query(
    query="Does attention mechanism benefit from sparse patterns?",
    namespace="phd-research",
    embed_fn=embed,
)

# Summary counts for each kind of memory surfaced by the query.
print(f"Found {len(results['papers'])} relevant papers")
print(f"Found {len(results['claims'])} supporting/contradicting claims")
print(f"Found {len(results['questions'])} related open questions")

# One header line per paper, followed by the first 150 characters of its text.
for paper_mem in results["papers"]:
    header = (
        f"\n{paper_mem.meta.get_attribute('paper_title')} "
        f"({paper_mem.meta.get_attribute('year')}) "
        f"- score: {paper_mem.score:.3f}"
    )
    print(header)
    print(f" {paper_mem.text[:150]}...")
The result is a research assistant that gets smarter over time. Every paper you ingest becomes part of a knowledge graph. Every contradiction you identify is a permanent edge that will surface whenever either paper is retrieved. Every open question accumulates evidence as you link findings from newly ingested papers. And foundational papers — the ones you cite in every session — build stickiness through recall, making them effectively permanent fixtures in your knowledge base.
Install: pip install feather-db · GitHub: github.com/feather-store/feather