News Summary

This example goes through a series of steps:

  1. You choose a topic area (e.g., "News", "Nvidia", "Music", etc.).
  2. The latest articles for that topic are fetched from various sources.
  3. Ollama summarizes each article.
  4. Sentence chunks are created from each article.
  5. Sentence Transformers generates embeddings for each of those chunks.
  6. You enter a question about the summaries that were shown.
  7. Sentence Transformers generates an embedding for that question.
  8. The embedded question is used to find the most similar sentence chunks.
  9. All of that is given to Ollama to produce a good answer to your question based on those news articles.

This example lets you pick from a few different topic areas, then summarizes the most recent x articles for that topic. It then creates chunks of sentences from each article and generates embeddings for those chunks. A minimal sketch of that chunk-and-retrieve step follows.
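The heart of steps 4 through 8 is small: embed the chunks, embed the question, and rank the chunks by cosine similarity. Here is a minimal sketch using the sentence-transformers API, with a naive splitter standing in for mattsollamatools' chunker (the example itself uses scikit-learn's NearestNeighbors for the ranking step):

from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

# Naive stand-in for mattsollamatools' chunker.
text = "Nvidia announced new GPUs. Prices were not disclosed. Shipping starts next year."
chunks = text.split(". ")

chunk_embeddings = model.encode(chunks)                     # shape (n_chunks, 384)
question_embedding = model.encode(["When do the GPUs ship?"])

# Rank chunks by cosine similarity to the question.
sims = (question_embedding @ chunk_embeddings.T) / (
    np.linalg.norm(question_embedding)
    * np.linalg.norm(chunk_embeddings, axis=1)
)
print(chunks[int(sims.argmax())])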

Running the Example

1. Ensure you have the mistral-openorca model installed:

ollama pull mistral-openorca

2. Install the required Python libraries:

pip install -r requirements.txt

3. Run the example:

python summ.py
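If the script can't reach the model, first confirm an Ollama server is listening on localhost:11434. A quick check from Python (a plain GET on the server's root path is expected to return a short status string):

import requests

# The Ollama server answers on its root path when it is up;
# the body is expected to be "Ollama is running".
resp = requests.get("http://localhost:11434")
print(resp.status_code, resp.text)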

Source

summ.py

import curses
import json
from utils import menu, getUrls, get_summary, getArticleText, knn_search
import requests
from sentence_transformers import SentenceTransformer
from mattsollamatools import chunker

if __name__ == "__main__":
    feed_url = curses.wrapper(menu)
    print("Here is your news summary:\n")
    urls = getUrls(feed_url, n=5)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    allEmbeddings = []

    for url in urls:
      article={}
      article['embeddings'] = []
      article['url'] = url
      text = getArticleText(url)
      summary = get_summary(text)
      chunks = chunker(text)  # Split the article text into chunks with mattsollamatools' chunker
      embeddings = model.encode(chunks)
      for (chunk, embedding) in zip(chunks, embeddings):
        item = {}
        item['source'] = chunk
        item['embedding'] = embedding.tolist()  # Convert NumPy array to list
        item['sourcelength'] = len(chunk)
        article['embeddings'].append(item)

      allEmbeddings.append(article)

      print(f"{summary}\n")


    while True:
      context = []
      # Input a question from the user
      question = input("Enter your question about the news, or type quit: ")

      if question.lower() == 'quit':
        break

      # Embed the user's question
      question_embedding = model.encode([question])

      # Perform KNN search to find the best matches (indices and source text)
      best_matches = knn_search(question_embedding, allEmbeddings, k=10)


      sourcetext=""
      for i, (index, source_text) in enumerate(best_matches, start=1):
          sourcetext += f"{i}. Index: {index}, Source Text: {source_text}"

      systemPrompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"

      url = "http://localhost:11434/api/generate"

      payload = {
          "model": "mistral-openorca",
          "prompt": question,
          "system": systemPrompt,
          "stream": False,
          "context": context
      }

      # Convert the payload to a JSON string
      payload_json = json.dumps(payload)

      # Set the headers to specify JSON content
      headers = {
          "Content-Type": "application/json"
      }

      # Send the POST request
      response = requests.post(url, data=payload_json, headers=headers)

      # Check the response
      if response.status_code == 200:
          output = json.loads(response.text)
          context = output['context']
          print(output['response'] + "\n")


      else:
          print(f"Request failed with status code {response.status_code}")

utils.py

import curses
import feedparser
import requests
import json
from newspaper import Article
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Create a dictionary to store topics and their URLs
topic_urls = {
    "Mac": "https://9to5mac.com/guides/mac/feed",
    "News": "http://www.npr.org/rss/rss.php?id=1001",
    "Nvidia": "https://nvidianews.nvidia.com/releases.xml",
    "Raspberry Pi": "https://www.raspberrypi.com/news/feed/",
    "Music": "https://www.billboard.com/c/music/music-news/feed/"
}

# Use curses to create a menu of topics
def menu(stdscr):
    url = get_url_for_topic(stdscr)

    stdscr.addstr(len(topic_urls) + 3, 0, f"Selected URL: {url}")
    stdscr.refresh()

    return url

# You have chosen a topic. Now return the url for that topic
def get_url_for_topic(stdscr):
    curses.curs_set(0)  # Hide the cursor
    stdscr.clear()

    stdscr.addstr(0, 0, "Choose a topic using the arrow keys (Press Enter to select):")

    # Create a list of topics
    topics = list(topic_urls.keys())
    current_topic = 0

    while True:
        for i, topic in enumerate(topics):
            if i == current_topic:
                stdscr.addstr(i + 2, 2, f"> {topic}")
            else:
                stdscr.addstr(i + 2, 2, f"  {topic}")

        stdscr.refresh()

        key = stdscr.getch()

        if key == curses.KEY_DOWN and current_topic < len(topics) - 1:
            current_topic += 1
        elif key == curses.KEY_UP and current_topic > 0:
            current_topic -= 1
        elif key == 10:  # Enter key
            return topic_urls[topics[current_topic]]

# Get the latest N article URLs from an RSS feed
def getUrls(feed_url, n=20):
    feed = feedparser.parse(feed_url)
    entries = feed.entries[:n]  # feeds typically list newest entries first
    urls = [entry.link for entry in entries]
    return urls

# News pages are often cluttered with ads and menus. This uses newspaper3k to extract just the article text.
def getArticleText(url):
  article = Article(url)
  article.download()
  article.parse()
  return article.text

def get_summary(text):
  systemPrompt = "Write a concise summary of the text, return your responses with 5 lines that cover the key points of the text given."
  prompt = text

  url = "http://localhost:11434/api/generate"

  payload = {
    "model": "mistral-openorca",
    "prompt": prompt,
    "system": systemPrompt,
    "stream": False
  }
  payload_json = json.dumps(payload)
  headers = {"Content-Type": "application/json"}
  response = requests.post(url, data=payload_json, headers=headers)

  return json.loads(response.text)["response"]

# Perform K-nearest neighbors (KNN) search
def knn_search(question_embedding, embeddings, k=5):
    X = np.array([item['embedding'] for article in embeddings for item in article['embeddings']])
    source_texts = [item['source'] for article in embeddings for item in article['embeddings']]

    # Fit a KNN model on the embeddings
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(X)

    # Find the indices and distances of the k-nearest neighbors
    distances, indices = knn.kneighbors(question_embedding, n_neighbors=k)

    # Get the indices and source texts of the best matches
    best_matches = [(indices[0][i], source_texts[indices[0][i]]) for i in range(k)]

    return best_matches
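knn_search expects the nested structure summ.py builds: a list of articles, each carrying an embeddings list of items with 'embedding' and 'source' keys. A toy invocation, with random vectors standing in for real embeddings (384 matches all-MiniLM-L6-v2's output dimension):

import numpy as np
from utils import knn_search  # the function defined above

rng = np.random.default_rng(0)
fake_articles = [{
    "url": "https://example.com/a",
    "embeddings": [
        {"source": f"chunk {i}", "embedding": rng.normal(size=384).tolist()}
        for i in range(8)
    ],
}]
question = rng.normal(size=(1, 384))
for index, source in knn_search(question, fake_articles, k=3):
    print(index, source)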

Source Repository

python-rag-newssummary