Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions docs/articles/langchain-vc-memo-agent/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# VC Investment Memo Agent with LangGraph

Runnable code for the cookbook guide at
https://docs.perplexity.ai/docs/cookbook/articles/langchain-vc-memo-agent/README

```bash
cd scripts
pip install -r requirements.txt
export PPLX_API_KEY="pplx-..."
export LANGSMITH_API_KEY="ls__..."
export LANGSMITH_TRACING="true"
python -m memo --company "Anthropic"
```
11 changes: 11 additions & 0 deletions docs/articles/langchain-vc-memo-agent/scripts/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# ChatPerplexity reads PPLX_API_KEY.
PPLX_API_KEY=pplx-...

# LangSmith — tracing is on from the start so every node's tool calls and
# token usage are captured end-to-end.
LANGSMITH_API_KEY=ls__...
LANGSMITH_TRACING=true

# Only needed when running the provider comparison (memo/compare.py).
PARALLEL_API_KEY=...
EXA_API_KEY=...
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .main import main


# Run the CLI entrypoint imported from .main when this module is executed directly.
if __name__ == "__main__":
    main()
48 changes: 48 additions & 0 deletions docs/articles/langchain-vc-memo-agent/scripts/memo/compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import asyncio
from langsmith import Client
from langsmith.evaluation import evaluate

from .eval_dataset import EVAL_COMPANIES
from .evaluators import primary_source_rate, financial_concept_coverage
from .profiles import PROFILES, ProviderProfile

# Name of the LangSmith dataset that upload_dataset() creates and main() evaluates against.
DATASET_NAME = "vc-memo-eval-v2"
# Module-level LangSmith client; constructed once, at import time.
client = Client()


def upload_dataset() -> None:
    """Seed the LangSmith eval dataset with one example per EVAL_COMPANIES entry.

    Returns without doing anything when a dataset named DATASET_NAME is
    already present, so repeated runs do not create duplicates.
    """
    existing_names = (ds.name for ds in client.list_datasets())
    if DATASET_NAME in existing_names:
        return

    created = client.create_dataset(DATASET_NAME, description="VC memo evaluation")
    for company_name in EVAL_COMPANIES:
        # No reference outputs: the evaluators score the memo text itself.
        client.create_example(
            inputs={"company": company_name},
            outputs={},
            dataset_id=created.id,
        )


async def _run_one(company: str, profile: ProviderProfile) -> dict:
    """Generate a memo for `company` using the graph built by `profile`.

    Returns a dict with a single key, "memo_md", holding the final memo text.
    """
    compiled = profile.build_graph()
    initial_state = {"company": company, "research_output": {}, "memo": ""}
    result = await compiled.ainvoke(initial_state)
    return {"memo_md": result["memo"]}


def main() -> None:
    """Ensure the dataset exists, then run one LangSmith experiment per provider."""
    upload_dataset()
    provider_names = ("perplexity", "parallel", "exa")
    for name in provider_names:
        profile = PROFILES[name]
        # `profile` is bound as a default argument so each target keeps its own
        # profile instead of the loop variable's final value.
        evaluate(
            lambda inputs, profile=profile: asyncio.run(_run_one(inputs["company"], profile)),
            data=DATASET_NAME,
            evaluators=[primary_source_rate, financial_concept_coverage],
            experiment_prefix=f"memo-{name}",
            max_concurrency=2,
        )


# Run the provider comparison when this module is executed directly.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# A mix of public and private companies across sectors. The evaluators (see
# evaluators.py) are search-quality metrics — no per-company ground truth is
# required, so extend this list with your own targets freely.
EVAL_COMPANIES = [
    "Anduril", "Arm Holdings", "Cohere", "CrowdStrike", "Klaviyo",
    "Mistral AI", "Palantir", "Perplexity", "Reddit", "xAI",
]
61 changes: 61 additions & 0 deletions docs/articles/langchain-vc-memo-agent/scripts/memo/evaluators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import re
from urllib.parse import urlparse
from langsmith.evaluation import EvaluationResult, run_evaluator

# Matches an http(s) URL up to the next whitespace character.
URL_RE = re.compile(r"https?://\S+")

# Host labels that conventionally mark investor-relations / press subdomains.
PRIMARY_HOST_RE = re.compile(
    r"(^|\.)(investors?|ir|investorrelations?|press|news|newsroom|press-?releases?|media)\.",
    re.IGNORECASE,
)
# Registrable domains treated as primary sources (filings and press wires).
PRIMARY_DOMAINS = {"sec.gov", "edgar.sec.gov", "businesswire.com",
                   "prnewswire.com", "globenewswire.com"}
# Registrable domains treated as secondary aggregators.
AGGREGATOR_DOMAINS = {"en.wikipedia.org", "crunchbase.com", "pitchbook.com",
                      "simplywall.st", "stockanalysis.com", "finance.yahoo.com",
                      "reddit.com", "medium.com", "macrotrends.net"}


def _host_matches(host: str, domains) -> bool:
    """Return True if `host` equals, or is a subdomain of, any entry in `domains`."""
    return any(host == d or host.endswith("." + d) for d in domains)


def _classify(url: str, company: str) -> str:
    """Classify a cited URL as "primary", "aggregator", or "neutral".

    Args:
        url: The cited URL.
        company: Company name; any word longer than three characters that
            appears in the host marks the URL as the company's own domain.

    Returns:
        "primary" for listed primary domains, IR/press-style subdomains, and
        the company's own domain; "aggregator" for listed aggregator domains;
        "neutral" for everything else, including URLs that cannot be parsed.
    """
    try:
        # `.hostname` drops any port/userinfo and is already lower-cased, so
        # "https://sec.gov:443/..." still resolves to "sec.gov".
        host = (urlparse(url).hostname or "").lower()
    except ValueError:
        # urlparse rejects malformed bracketed hosts; treat those as unclassifiable.
        return "neutral"
    if not host:
        return "neutral"

    # Suffix matching on a label boundary so "www.sec.gov" counts as "sec.gov"
    # while "notsec.gov" does not.
    if _host_matches(host, PRIMARY_DOMAINS) or PRIMARY_HOST_RE.search(host):
        return "primary"
    co_words = [w.lower() for w in company.split() if len(w) > 3]
    if any(w in host for w in co_words):
        return "primary"  # Company's own domain counts as primary
    if _host_matches(host, AGGREGATOR_DOMAINS):
        return "aggregator"
    return "neutral"


@run_evaluator
def primary_source_rate(run, example) -> EvaluationResult:
    """Score the share of citations that come from primary sources.

    Primary sources are IR pages, SEC filings, and official press; aggregators
    are sites such as Wikipedia or Crunchbase. Neutral domains are excluded
    from the ratio.

    Args:
        run: The evaluated run; the memo is read from run.outputs["memo_md"].
        example: The dataset example; the company is read from
            example.inputs["company"].

    Returns:
        An EvaluationResult keyed "primary_source_rate" whose score is
        primary / (primary + aggregator), or None when the memo cites neither
        kind of source (including when the run produced no memo).
    """
    # A run without outputs is scored as "no citations" rather than raising.
    memo = (run.outputs or {}).get("memo_md") or ""
    company = example.inputs["company"]
    # Trailing punctuation and closing brackets come from prose, markdown links
    # "[text](url)" and autolinks "<url>", not from the URL itself.
    urls = [u.rstrip(".,;:)>]") for u in URL_RE.findall(memo)]
    # Classify each URL once and tally the labels.
    labels = [_classify(u, company) for u in urls]
    primary = labels.count("primary")
    aggregator = labels.count("aggregator")
    denom = primary + aggregator
    score = primary / denom if denom else None
    return EvaluationResult(key="primary_source_rate", score=score)


@run_evaluator
def financial_concept_coverage(run, example) -> EvaluationResult:
    """Score how many of four financial concepts the Financials section mentions.

    The four concepts are valuation, revenue/ARR, funding, and operating
    metrics. The section is located by its H2 heading (a line starting with
    "## " that contains "Financials") and runs to the next H2 heading or the
    end of the memo, so "### Citations" subsections are part of the body.

    Args:
        run: The evaluated run; the memo is read from run.outputs["memo_md"].
        example: The dataset example (unused).

    Returns:
        An EvaluationResult keyed "financial_concept_coverage" with a score in
        {0, 0.25, 0.5, 0.75, 1.0}; 0.0 when no Financials heading is found.
    """
    # A run without outputs is scored as a memo with no Financials section.
    memo = (run.outputs or {}).get("memo_md") or ""
    # Keep the heading match on a single line. Under DOTALL a bare `.*` crosses
    # newlines, which would let the match start at the first heading in the memo
    # and stretch to the last mention of "financials" anywhere in the text.
    m = re.search(
        r"^##(?!#)[^\n]*financials[^\n]*\n(.*?)(?=^##\s|\Z)",
        memo,
        re.DOTALL | re.IGNORECASE | re.MULTILINE,
    )
    if not m:
        return EvaluationResult(key="financial_concept_coverage", score=0.0)
    body = m.group(1).lower()
    concept_patterns = (
        r"valuation|valued at|market cap|post-money|pre-money",
        # \barr\b also catches "ARR:", "ARR." and "(ARR)", not only "ARR ".
        r"revenue|\barr\b|annual recurring|run.?rate",
        r"raised|series [a-h]|funding round|total funding",
        r"gross margin|operating margin|cash position|growth|customers|headcount|employees",
    )
    hits = sum(1 for pattern in concept_patterns if re.search(pattern, body))
    return EvaluationResult(key="financial_concept_coverage", score=hits / len(concept_patterns))
196 changes: 196 additions & 0 deletions docs/articles/langchain-vc-memo-agent/scripts/memo/graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# NOTE: The inline code blocks in the cookbook guide
# (docs/cookbook/articles/langchain-vc-memo-agent/README.mdx in ppl-ai/api-docs)
# are verbatim slices of this file. When editing either, update both in the
# same change.
from __future__ import annotations

from datetime import datetime, timezone
from typing import Annotated, Any, TypedDict

from langchain_core.messages import AIMessage
from langchain_perplexity import ChatPerplexity
from langgraph.graph import END, START, StateGraph


def merge_research_output(left: dict[str, str], right: dict[str, str]) -> dict[str, str]:
    """Combine two per-section research dicts into a new dict.

    Each research node returns {"<section>": "..."}. A missing (falsy) side is
    treated as empty, and keys in `right` win over the same keys in `left`.
    """
    merged = dict(left or {})
    merged.update(right or {})
    return merged


class MemoState(TypedDict):
    """State dict threaded through the memo graph."""

    # Name of the company the memo is about.
    company: str
    # Research text keyed by section name; updates returned by different nodes
    # are combined with merge_research_output.
    research_output: Annotated[dict[str, str], merge_research_output]
    # Final memo text, set by synthesizer_node.
    memo: str


# Model identifiers for the research nodes and for the synthesizer. They are
# kept as separate constants so the two roles can be changed independently.
SUBNODE_MODEL_NAME = "openai/gpt-5.5"
SYNTHESIZER_MODEL_NAME = "openai/gpt-5.5"


def _agent_model(model: str) -> ChatPerplexity:
    """Return a ChatPerplexity client for `model` that uses the Responses API."""
    # Sampling params such as temperature are ignored by the Responses (Agent)
    # API, so none are passed here.
    client_kwargs = {"model": model, "use_responses_api": True}
    return ChatPerplexity(**client_kwargs)


# Shared client instances, built once at import time: SUBNODE_MODEL is used by
# _run_research and SYNTHESIZER_MODEL by synthesizer_node.
SUBNODE_MODEL = _agent_model(SUBNODE_MODEL_NAME)
SYNTHESIZER_MODEL = _agent_model(SYNTHESIZER_MODEL_NAME)


# Per-research-node tool specs. Each list is passed as `tools=` to the model
# call made for that section.
# Team: web search with the recency filter set to "year".
TEAM_TOOLS = [{
    "type": "web_search",
    "filters": {"search_recency_filter": "year"},
}]

# Product: unfiltered web search.
PRODUCT_TOOLS = [{"type": "web_search"}]

# Market: unfiltered web search.
MARKET_TOOLS = [{"type": "web_search"}]

# Financials: finance_search plus web search.
FINANCIALS_TOOLS = [{"type": "finance_search"}, {"type": "web_search"}]


# Per-research-node max_steps caps the Perplexity Agent API's internal search loop.
# Financials gets the largest budget; the value is sent as extra_body["max_steps"].
RESEARCH_MAX_STEPS = {
    "team": 2, "financials": 5, "product": 2, "market": 2,
}


# System-prompt template shared by all research nodes. _run_research fills in
# {section}, {company}, and {guidance} (the matching GUIDANCE entry).
RESEARCH_PROMPT = """You are a VC analyst writing the {section} section of the research output for {company}.

{guidance}

Return a markdown section, then end the document with a "### Citations" header \
followed by a markdown list of:

- <url> — one-sentence evidence quoted from the source

Cite only URLs that came back from your tool calls; never fabricate URLs. \
Keep the section focused — 250-400 words is appropriate for the body."""


# Section-specific instructions inserted into RESEARCH_PROMPT as {guidance}.
# Keys must match the section names used by the research nodes. Each entry may
# only describe capabilities the section's tool spec actually provides: the
# market tool list carries no source filter, so its guidance asks the model to
# prioritize analyst and trade-press sources instead of claiming the search is
# already scoped to them.
GUIDANCE = {
    "team": (
        "Search for the founders, CEO, and other named executives. Capture each "
        "leader's prior roles and education. Prioritize the company's own About/Team "
        "page and professional-network sources."
    ),
    "financials": (
        "If the company is public, use finance_search for revenue, margins, and analyst "
        "estimates. If private, use web_search for funding rounds, valuation, and "
        "disclosed revenue. Cross-check structured data against recent news."
    ),
    "product": (
        "Describe the company's flagship product, recent launches, and technical "
        "differentiators. Cite the company's own product or engineering pages where "
        "possible, plus tech-press coverage for context."
    ),
    "market": (
        "Map the competitive landscape, name direct competitors, and surface market "
        "sizing. Prioritize analyst and trade-press sources."
    ),
}


def _run_research(
    state: MemoState,
    *,
    section: str,
    tools: list[dict[str, Any]],
    max_steps: int,
) -> dict[str, dict[str, str]]:
    """Call the research model for one section and wrap its reply as a state update.

    Args:
        state: Current graph state; only "company" is read.
        section: Section name; selects the GUIDANCE entry and keys the result.
        tools: Tool specs forwarded to the model call.
        max_steps: Sent to the API as extra_body["max_steps"].

    Returns:
        {"research_output": {section: <model reply content>}}.
    """
    system_prompt = RESEARCH_PROMPT.format(
        section=section, company=state["company"], guidance=GUIDANCE[section],
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Research the {section} of {state['company']}."},
    ]
    reply: AIMessage = SUBNODE_MODEL.invoke(
        conversation,
        tools=tools,
        extra_body={"max_steps": max_steps},
    )
    return {"research_output": {section: reply.content}}


def team_node(state):
    """Graph node: run the "team" research section (founders and leadership)."""
    section = "team"
    return _run_research(
        state,
        section=section,
        tools=TEAM_TOOLS,
        max_steps=RESEARCH_MAX_STEPS[section],
    )

def financials_node(state):
    """Graph node: run the "financials" research section (revenue, funding, metrics)."""
    section = "financials"
    return _run_research(
        state,
        section=section,
        tools=FINANCIALS_TOOLS,
        max_steps=RESEARCH_MAX_STEPS[section],
    )

def product_node(state):
    """Graph node: run the "product" research section (product, launches, differentiators)."""
    section = "product"
    return _run_research(
        state,
        section=section,
        tools=PRODUCT_TOOLS,
        max_steps=RESEARCH_MAX_STEPS[section],
    )

def market_node(state):
    """Graph node: run the "market" research section (competitors and market sizing)."""
    section = "market"
    return _run_research(
        state,
        section=section,
        tools=MARKET_TOOLS,
        max_steps=RESEARCH_MAX_STEPS[section],
    )


# System-prompt template for synthesizer_node; {company} is the only
# placeholder. The required `## <N> · <Section Name>` heading format is what
# the financial_concept_coverage evaluator searches for when it locates the
# Financials section, so change the two together.
SYNTH_PROMPT = """You are a senior VC partner writing the final memo for {company}.

You may only cite evidence that appears in the research outputs below. You have no \
tools; do not browse or fabricate sources.

Produce a markdown memo with these seven sections, in order:

1. Snapshot — what the company is, founded, valuation, positioning (3-4 sentences)
2. Team — founders, leadership, recent senior hires
3. Financials — revenue, growth, funding history, comparables
4. Product — what they sell, technology, distribution
5. Market — TAM, direct competitors, category dynamics
6. Risks — top 3-5 risks with brief reasoning
7. Thesis — 1-2 paragraphs of analysis, ending with a single line:
"Recommendation: <PASS | TRACK | ADVANCE | LEAD>"

Each section's H2 heading must be exactly `## <N> · <Section Name>` \
(e.g. `## 1 · Snapshot`), using a middle-dot separator — the evaluator depends \
on this format.

Each of sections 1-6 must end with a `### Citations` subsection listing the \
<url> — <evidence> pairs drawn from the research outputs. Section 7 (Thesis) does \
not need its own citations.

If a research output lacks evidence for a section, write "Insufficient evidence in \
research outputs." in that section's body instead of guessing."""


def synthesizer_node(state: MemoState) -> dict[str, str]:
    """Write the final memo from the collected research outputs.

    The research sections are concatenated in sorted key order and sent to the
    synthesizer model together with the company name and a UTC timestamp. No
    tools are attached to this call.

    Returns:
        {"memo": <model reply content>}.
    """
    sections = [
        f"## Research output: {name}\n\n{body}"
        for name, body in sorted(state["research_output"].items())
    ]
    research_output_block = "\n\n".join(sections)

    system_message = {
        "role": "system",
        "content": SYNTH_PROMPT.format(company=state["company"]),
    }
    user_message = {
        "role": "user",
        "content": (
            f"Company: {state['company']}\n"
            f"As-of: {datetime.now(timezone.utc).isoformat(timespec='seconds')}\n\n"
            f"Research outputs:\n\n{research_output_block}"
        ),
    }
    reply: AIMessage = SYNTHESIZER_MODEL.invoke([system_message, user_message])
    return {"memo": reply.content}


def build_graph():
    """Build and compile the memo graph.

    The four research nodes each run from START and each feed the synthesizer,
    which leads to END.
    """
    research_nodes = {
        "team": team_node,
        "financials": financials_node,
        "product": product_node,
        "market": market_node,
    }

    builder = StateGraph(MemoState)
    for node_name, node_fn in research_nodes.items():
        builder.add_node(node_name, node_fn)
    builder.add_node("synthesizer", synthesizer_node)

    # Fan out from START to every research node, then fan in to the synthesizer.
    for node_name in research_nodes:
        builder.add_edge(START, node_name)
        builder.add_edge(node_name, "synthesizer")

    builder.add_edge("synthesizer", END)
    return builder.compile()
23 changes: 23 additions & 0 deletions docs/articles/langchain-vc-memo-agent/scripts/memo/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import argparse
import asyncio

from .graph import build_graph


async def run_memo(company: str) -> str:
    """Build the memo graph, run it for `company`, and return the final memo text."""
    initial_state = {"company": company, "research_output": {}, "memo": ""}
    compiled = build_graph()
    result = await compiled.ainvoke(initial_state)
    return result["memo"]


def main() -> None:
    """CLI entrypoint: read the required `--company` option and print its memo."""
    cli = argparse.ArgumentParser(description="VC investment memo agent.")
    cli.add_argument("--company", required=True)
    options = cli.parse_args()
    memo = asyncio.run(run_memo(options.company))
    print(memo)


# Run the CLI when this module is executed directly.
if __name__ == "__main__":
    main()
Loading
Loading