
Commit dfd0215

Merge pull request #57 from lthomas122/main
Add SearXNG Support
2 parents d49c58a + 8bb105d commit dfd0215

File tree

5 files changed: +73 −17 lines changed


.env.example

Lines changed: 7 additions & 3 deletions
@@ -1,8 +1,12 @@
-OLLAMA_BASE_URL=http://localhost:11434 # the endpoint of the Ollama service, defaults to http://localhost:11434 if not set
-OLLAMA_MODEL=deepseek-r1:1.5b # the name of the model to use, defaults to 'llama3.2' if not set
+
+OLLAMA_BASE_URL=http://localhost:11434 # the endpoint of the Ollama service, defaults to http://localhost:11434 if not set
+OLLAMA_MODEL=llama3.2 # the name of the model to use, defaults to 'llama3.2' if not set
 
-# Which search service to use, either 'duckduckgo' or 'tavily' or 'perplexity'
+# Which search service to use, either 'duckduckgo', 'tavily', 'perplexity', Searxng
 SEARCH_API='duckduckgo'
+# For Searxng search, defaults to http://localhost:8888
+SEARXNG_URL=
+
 # Web Search API Keys (choose one or both)
 TAVILY_API_KEY=tvly-xxxxx # Get your key at https://tavily.com
 PERPLEXITY_API_KEY=pplx-xxxxx # Get your key at https://www.perplexity.ai
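
For context, a minimal sketch of how the new variable is picked up on the Python side: the utils.py change in this commit reads SEARXNG_URL with os.environ.get and falls back to the documented default. Everything beyond that one call is illustrative.

```python
import os

# Resolve the SearXNG endpoint the same way the commit's utils.py does:
# use SEARXNG_URL if it is set, otherwise the documented default.
host = os.environ.get("SEARXNG_URL", "http://localhost:8888")
print(f"Using SearXNG instance at {host}")
```

Note that os.environ.get only falls back when the variable is absent; if the .env file is loaded with SEARXNG_URL= left empty (as in the example above), the resolved value may be an empty string rather than the default.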

README.md

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ cd ollama-deep-researcher
 
 4. Select a web search tool:
 
-By default, it will use [DuckDuckGo](https://duckduckgo.com/) for web search, which does not require an API key. But you can also use [Tavily](https://tavily.com/) or [Perplexity](https://www.perplexity.ai/hub/blog/introducing-the-sonar-pro-api) by adding their API keys to the environment file:
+By default, it will use [DuckDuckGo](https://duckduckgo.com/) for web search, which does not require an API key. But you can also use [SearXNG](https://docs.searxng.org/), [Tavily](https://tavily.com/) or [Perplexity](https://www.perplexity.ai/hub/blog/introducing-the-sonar-pro-api) by adding their API keys to the environment file:
 ```bash
 cp .env.example .env
 ```

src/assistant/configuration.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ class SearchAPI(Enum):
     PERPLEXITY = "perplexity"
     TAVILY = "tavily"
     DUCKDUCKGO = "duckduckgo"
+    SEARXNG = "searxng"
 
 @dataclass(kw_only=True)
 class Configuration:
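
As a quick illustration of what the new enum member enables, the sketch below copies the SearchAPI shape shown in this hunk and resolves a configuration string (such as the SEARCH_API value from .env) to a member by value. The rest of the Configuration class is not shown in the diff, so nothing else is assumed.

```python
from enum import Enum

# Illustrative copy of the enum as it looks after this change
# (the real definition lives in src/assistant/configuration.py).
class SearchAPI(Enum):
    PERPLEXITY = "perplexity"
    TAVILY = "tavily"
    DUCKDUCKGO = "duckduckgo"
    SEARXNG = "searxng"

# Enum lookup by value: "searxng" now resolves instead of raising ValueError.
api = SearchAPI("searxng")
assert api is SearchAPI.SEARXNG
```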

src/assistant/graph.py

Lines changed: 4 additions & 1 deletion
@@ -8,7 +8,7 @@
 from langgraph.graph import START, END, StateGraph
 
 from assistant.configuration import Configuration, SearchAPI
-from assistant.utils import deduplicate_and_format_sources, tavily_search, format_sources, perplexity_search, duckduckgo_search
+from assistant.utils import deduplicate_and_format_sources, tavily_search, format_sources, perplexity_search, duckduckgo_search, searxng_search
 from assistant.state import SummaryState, SummaryStateInput, SummaryStateOutput
 from assistant.prompts import query_writer_instructions, summarizer_instructions, reflection_instructions
 
@@ -54,6 +54,9 @@ def web_research(state: SummaryState, config: RunnableConfig):
     elif search_api == "duckduckgo":
         search_results = duckduckgo_search(state.search_query, max_results=3, fetch_full_page=configurable.fetch_full_page)
         search_str = deduplicate_and_format_sources(search_results, max_tokens_per_source=1000, include_raw_content=True)
+    elif search_api == "searxng":
+        search_results = searxng_search(state.search_query, max_results=3, fetch_full_page=configurable.fetch_full_page)
+        search_str = deduplicate_and_format_sources(search_results, max_tokens_per_source=1000, include_raw_content=False)
     else:
         raise ValueError(f"Unsupported search API: {configurable.search_api}")
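
The new branch mirrors the existing DuckDuckGo path, except that it passes include_raw_content=False when formatting sources. The same two calls can also be exercised directly, outside the graph; the sketch below assumes a reachable SearXNG instance and uses a made-up query string, while the function names and parameters are taken from this hunk.

```python
from assistant.utils import searxng_search, deduplicate_and_format_sources

# Run the same two steps web_research performs for the "searxng" branch,
# outside of the LangGraph state machine (query string is illustrative).
search_results = searxng_search("ollama deep researcher", max_results=3, fetch_full_page=False)
search_str = deduplicate_and_format_sources(search_results, max_tokens_per_source=1000, include_raw_content=False)
print(search_str)
```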

src/assistant/utils.py

Lines changed: 60 additions & 12 deletions
@@ -4,6 +4,7 @@
 from langsmith import traceable
 from tavily import TavilyClient
 from duckduckgo_search import DDGS
+from langchain_community.utilities import SearxSearchWrapper
 
 def deduplicate_and_format_sources(search_response, max_tokens_per_source, include_raw_content=False):
     """
@@ -71,6 +72,18 @@ def format_sources(search_results):
         f"* {source['title']} : {source['url']}"
         for source in search_results['results']
     )
+def fetch_raw_content(url):
+    try:
+        # Try to fetch the full page content using curl
+        import urllib.request
+        from bs4 import BeautifulSoup
+        response = urllib.request.urlopen(url)
+        html = response.read()
+        soup = BeautifulSoup(html, 'html.parser')
+        return soup.get_text()
+
+    except Exception as e:
+        print(f"Warning: Failed to fetch full page content for {url}: {str(e)}")
 
 @traceable
 def duckduckgo_search(query: str, max_results: int = 3, fetch_full_page: bool = False) -> Dict[str, List[Dict[str, str]]]:
@@ -104,18 +117,7 @@ def duckduckgo_search(query: str, max_results: int = 3, fetch_full_page: bool =
 
                 raw_content = content
                 if fetch_full_page:
-                    try:
-                        # Try to fetch the full page content using curl
-                        import urllib.request
-                        from bs4 import BeautifulSoup
-
-                        response = urllib.request.urlopen(url)
-                        html = response.read()
-                        soup = BeautifulSoup(html, 'html.parser')
-                        raw_content = soup.get_text()
-
-                    except Exception as e:
-                        print(f"Warning: Failed to fetch full page content for {url}: {str(e)}")
+                    raw_content = fetch_raw_content(url)
 
                 # Add result to list
                 result = {
@@ -132,6 +134,52 @@ def duckduckgo_search(query: str, max_results: int = 3, fetch_full_page: bool =
         print(f"Full error details: {type(e).__name__}")
         return {"results": []}
 
+@traceable
+def searxng_search(query: str, max_results: int = 3, fetch_full_page = False):
+    """Search the web using SearXNG.
+
+    Args:
+        query (str): The search query to execute
+        max_results (int): Maximum number of results to return
+        fetch_full_page: Fetch page content from results urls
+
+    Returns:
+        dict: Search response containing:
+            - results (list): List of search result dictionaries, each containing:
+                - title (str): Title of the search result
+                - url (str): URL of the search result
+                - content (str): Snippet/summary of the content
+                - raw_content (str): Same as content since SearXNG doesn't provide full page content, unless fetch_full_page is true
+    """
+    host=os.environ.get("SEARXNG_URL", "http://localhost:8888")
+    s = SearxSearchWrapper(searx_host=host)
+
+    results = []
+    search_results = s.results(query, num_results=max_results)
+    for r in search_results:
+        url = r.get('link')
+        title = r.get('title')
+        content = r.get('snippet')
+
+        if not all([url, title, content]):
+            print(f"Warning: Incomplete result from SearXNG: {r}")
+            continue
+
+        raw_content = content
+        if fetch_full_page:
+            raw_content = fetch_raw_content(url)
+
+        # Add result to list
+        result = {
+            "title": title,
+            "url": url,
+            "content": content,
+            "raw_content": raw_content
+        }
+        results.append(result)
+    return {"results": results}
+
+
 @traceable
 def tavily_search(query, include_raw_content=True, max_results=3):
     """ Search the web using the Tavily API.
