Related knowledge base: LlamaIndex API Example
Reader and Query Engine
from llama_index.core import SimpleDirectoryReader, SummaryIndex
documents = SimpleDirectoryReader('files').load_data()
query_engine = SummaryIndex.from_documents(documents).as_query_engine()  # a SummaryIndex is assumed here; any index type works
response = query_engine.query("summarize each document in a few sentences")
Document, TextNode, and Index
from llama_index.core import Document, SummaryIndex
from llama_index.core.schema import TextNode

doc = Document(
    text=text,  # `text` is assumed to already hold the document content
    metadata={'author': 'John Doe', 'category': 'others'},
    id_='1'
)
nodes = [
    TextNode(
        text="Lionel Messi is a football player from Argentina."
    ),
    TextNode(
        text="He has won the Ballon d'Or trophy 7 times."
    ),
    TextNode(text="Lionel Messi's hometown is Rosario."),
    TextNode(text="He was born on June 24, 1987.")
]
index = SummaryIndex(nodes)
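A brief usage sketch, assuming an LLM is already configured (for example via Settings, as shown in the next snippet): the index built from these nodes can be queried directly.
query_engine = index.as_query_engine()
response = query_engine.query("How many Ballon d'Or trophies has Messi won?")
print(response)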
Replace the default LLM
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI

Settings.llm = OpenAI(temperature=0.8, model="gpt-4")
Ingestion pipeline with cache
from llama_index.core import SimpleDirectoryReader
from llama_index.core.extractors import SummaryExtractor, QuestionsAnsweredExtractor
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.core.schema import TransformComponent

class CustomTransformation(TransformComponent):
    def __call__(self, nodes, **kwargs):
        # run any node transformation logic here
        return nodes
reader = SimpleDirectoryReader('files')
documents = reader.load_data()
try:
    cached_hashes = IngestionCache.from_persist_path(
        "./ingestion_cache.json"
    )
    print("Cache file found. Running using cache...")
except Exception:
    cached_hashes = ""
    print("No cache file found. Running without cache...")
pipeline = IngestionPipeline(
    transformations=[
        CustomTransformation(),
        TokenTextSplitter(
            separator=" ",
            chunk_size=512,
            chunk_overlap=128
        ),
        SummaryExtractor(),
        QuestionsAnsweredExtractor(questions=3)
    ],
    cache=cached_hashes
)
nodes = pipeline.run(documents=documents, show_progress=True)
pipeline.cache.persist("./ingestion_cache.json")
print("All documents loaded")
Ingest uploaded documents (PITS - document_uploader.py)
from global_settings import STORAGE_PATH, INDEX_STORAGE, CACHE_FILE
from logging_functions import log_action
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.extractors import SummaryExtractor
from llama_index.embeddings.openai import OpenAIEmbedding
def ingest_documents():
    documents = SimpleDirectoryReader(
        STORAGE_PATH,
        filename_as_id=True
    ).load_data()
    for doc in documents:
        print(doc.id_)
        log_action(
            f"File '{doc.id_}' uploaded by user",
            action_type="UPLOAD"
        )
    try:
        cached_hashes = IngestionCache.from_persist_path(CACHE_FILE)
        print("Cache file found. Running using cache...")
    except Exception:
        cached_hashes = ""
        print("No cache file found. Running without cache...")
    pipeline = IngestionPipeline(
        transformations=[
            TokenTextSplitter(chunk_size=1024, chunk_overlap=20),
            SummaryExtractor(summaries=['self']),
            OpenAIEmbedding()
        ],
        cache=cached_hashes
    )
    nodes = pipeline.run(documents=documents)
    pipeline.cache.persist(CACHE_FILE)
    return nodes

if __name__ == "__main__":
    embedded_nodes = ingest_documents()
VectorStoreIndex
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader("files").load_data()
index = VectorStoreIndex.from_documents(documents)
print("Index created successfully!")
Use a local embedding model
Alternatively, you can integrate LangChain and use the embedding models it provides (a sketch follows the example below).
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embedding_model = HuggingFaceEmbedding(
    model_name="WhereIsAI/UAE-Large-V1"
)
embeddings = embedding_model.get_text_embedding(
    "The quick brown fox jumps over the lazy cat!"
)
print(embeddings[:15])
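A minimal sketch of the LangChain route, assuming the llama-index-embeddings-langchain integration package and a LangChain embedding backend (here langchain-huggingface) are installed; the model name is only an example.
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_huggingface import HuggingFaceEmbeddings  # assumed backend

# wrap a LangChain embedding model so LlamaIndex can use it
lc_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embedding_model = LangchainEmbedding(lc_embeddings)
print(embedding_model.get_text_embedding("The quick brown fox jumps over the lazy cat!")[:15])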
Persist the Index from memory to disk
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)
index.storage_context.persist(persist_dir="index_cache")
print("Index persisted to disk.")
Reload the Index from disk
from llama_index.core import StorageContext, load_index_from_storage
storage_context = StorageContext.from_defaults(persist_dir="index_cache")
index = load_index_from_storage(storage_context)
print("Index loaded successfully!")
Use ChromaDB as the vector store
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
db = chromadb.PersistentClient(path="chroma_database")
chroma_collection = db.get_or_create_collection(
    "my_chroma_store"
)
vector_store = ChromaVectorStore(
    chroma_collection=chroma_collection
)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store
)
documents = SimpleDirectoryReader("files").load_data()
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context
)
The following part displays the entire contents of the ChromaDB collection:
results = chroma_collection.get()
print(results)
''' We can use the next part to rebuild the Index from the ChromaDB in future sessions:
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context
)
'''
Index classes
from llama_index.core import SummaryIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader("files").load_data()
index = SummaryIndex.from_documents(documents)
query_engine = index.as_query_engine()
response = query_engine.query("How many documents have you loaded?")
print(response)
from llama_index.core import TreeIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader("files").load_data()
index = TreeIndex.from_documents(documents)
query_engine = index.as_query_engine()
response = query_engine.query("Tell me about dogs")
print(response)
from llama_index.core import ComposableGraph, SimpleDirectoryReader, TreeIndex, SummaryIndex
documents = SimpleDirectoryReader("files").load_data()
index1 = TreeIndex.from_documents([documents[0]])
index2 = TreeIndex.from_documents([documents[1]])
summary1 = "A short introduction to ancient Rome"
summary2 = "Some facts about dogs"
graph = ComposableGraph.from_indices(
    SummaryIndex,
    [index1, index2],
    index_summaries=[summary1, summary2]
)
query_engine = graph.as_query_engine()
response = query_engine.query("What can you tell me?")
print(response)
from llama_index.core import DocumentSummaryIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader("files").load_data()
index = DocumentSummaryIndex.from_documents(
    documents,
    show_progress=True
)
summary1 = index.get_document_summary(documents[0].doc_id)
summary2 = index.get_document_summary(documents[1].doc_id)
print("\nSummary of the first document:" + summary1)
print("\nSummary of the second document:" + summary2)
from llama_index.core import KeywordTableIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader("files").load_data()
index = KeywordTableIndex.from_documents(documents)
query_engine = index.as_query_engine()
response = query_engine.query("What famous buildings were in ancient Rome?")
print(response)
from llama_index.core import KnowledgeGraphIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader("files").load_data()
index = KnowledgeGraphIndex.from_documents(documents, max_triplets_per_chunk=2, use_async=True)
query_engine = index.as_query_engine()
response = query_engine.query("Tell me about dogs.")
print(response)
Token counting with mock LLMs and embeddings (cost prediction)
import tiktoken
from llama_index.core import TreeIndex, SimpleDirectoryReader, Settings
from llama_index.core.llms.mock import MockLLM
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
llm = MockLLM(max_tokens=256)
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
)
callback_manager = CallbackManager([token_counter])
Settings.callback_manager = callback_manager
Settings.llm = llm
documents = SimpleDirectoryReader("cost_prediction_samples").load_data()
index = TreeIndex.from_documents(
    documents=documents,
    num_children=2,
    show_progress=True
)
print("Total LLM Token Count:", token_counter.total_llm_token_count)
import tiktoken
from llama_index.core import MockEmbedding, VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.core.llms.mock import MockLLM
embed_model = MockEmbedding(embed_dim=1536)
llm = MockLLM(max_tokens=256)
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
)
callback_manager = CallbackManager([token_counter])
Settings.embed_model = embed_model
Settings.llm = llm
Settings.callback_manager = callback_manager
documents = SimpleDirectoryReader("cost_prediction_samples").load_data()
index = VectorStoreIndex.from_documents(
    documents=documents,
    show_progress=True
)
print("Embedding Token Count:", token_counter.total_embedding_token_count)
query_engine = index.as_query_engine()
response = query_engine.query("What's the cat's name?")
print("Query LLM Token Count:", token_counter.total_llm_token_count)
print("Query Embedding Token Count:",token_counter.total_embedding_token_count)
PITS - index_builder.py
from llama_index.core import VectorStoreIndex, TreeIndex, load_index_from_storage
from llama_index.core import StorageContext
from global_settings import INDEX_STORAGE
from document_uploader import ingest_documents
def build_indexes(nodes):
    try:
        storage_context = StorageContext.from_defaults(
            persist_dir=INDEX_STORAGE
        )
        vector_index = load_index_from_storage(
            storage_context, index_id="vector"
        )
        tree_index = load_index_from_storage(
            storage_context, index_id="tree"
        )
        print("All indices loaded from storage.")
    except Exception as e:
        print(f"Error occurred while loading indices: {e}")
        storage_context = StorageContext.from_defaults()
        vector_index = VectorStoreIndex(
            nodes, storage_context=storage_context
        )
        vector_index.set_index_id("vector")
        tree_index = TreeIndex(
            nodes, storage_context=storage_context
        )
        tree_index.set_index_id("tree")
        storage_context.persist(
            persist_dir=INDEX_STORAGE
        )
        print("New indexes created and persisted.")
    return vector_index, tree_index
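A brief usage sketch: combined with the ingest_documents function from document_uploader.py above, the indexes can be built or reloaded like this.
if __name__ == "__main__":
    nodes = ingest_documents()
    vector_index, tree_index = build_indexes(nodes)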