How to extract nouns from a response using Python and Langchain

pcww981p asked 5 months ago in Python

I have a task that involves extracting the nouns from a variable called message: response and displaying or printing them in the console. How can I do this with Python? I have tried libraries such as NLTK and TextBlob, but I am not sure how to use them correctly. I also asked GitHub Copilot for help, but it did not generate any useful code, just random output that does not work. Can anyone help me with this?

from flask import Flask, render_template, jsonify, request
from flask_cors import CORS
from textblob import TextBlob
import requests, openai, os
from langchain.llms import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from datasets import load_dataset , Dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from tqdm.auto import tqdm
from uuid import uuid4
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain
# Flask setup
llm = OpenAI()
memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=100)
app = Flask(__name__)
CORS(app)

# Pinecone / dataset setup
# trust_remote_code must be passed as a keyword argument to load_dataset,
# not set as a standalone variable
data = load_dataset(
    "wikipedia", "20220301.simple",
    split='train[:1000]', trust_remote_code=True
)
tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
            "we can find the length of this chunk of text in tokens")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

chunks = text_splitter.split_text(data[6]['text'])[:3]
tiktoken_len(chunks[0]), tiktoken_len(chunks[1]), tiktoken_len(chunks[2])
OPENAI_API_KEY = "API KEY"
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

res = embed.embed_documents(texts)
len(res), len(res[0])
# find API key in console at app.pinecone.io
YOUR_API_KEY = "API KEY"
# find ENV (cloud region) next to API key in console
YOUR_ENV = "gcp-starter"

index_name = 'langchain-retrieval-augmentation'
pinecone.init(
    api_key=YOUR_API_KEY,
    environment=YOUR_ENV
)
index = pinecone.GRPCIndex(index_name)
batch_limit = 100
texts = []
metadatas = []
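# NOTE: in the full pipeline, the dataset chunks would be appended to
# texts/metadatas here (in batches of batch_limit) before upserting;
# that loop is missing from this snippet, so the upsert below is skipped.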
if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))
text_field = "text"

# switch back to normal index for langchain
index = pinecone.Index(index_name)

vectorstore = Pinecone(
    index, embed.embed_query, text_field
)
query="What is the capital of France?"
vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)
# completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
qa_with_sources(query)
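# Note: the chain call above returns a dict; assuming the default output
# keys of RetrievalQAWithSourcesChain, it looks like
# {'question': ..., 'answer': ..., 'sources': ...}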

@app.route('/')
def home():
    # renamed from index() so it does not shadow the Pinecone `index` object
    return render_template('index.html')

@app.route('/data', methods=['POST'])
def getdata():
    data = request.get_json()
    user_input = data.get('data')
    try:
        response = qa_with_sources(user_input)
        return jsonify({"message": response, "response": True})
    except Exception as e:
        print(e)
        error_message = f'Error: {str(e)}'
        return jsonify({"message": error_message, "response": False})
    

if __name__ == '__main__':
    app.run(debug=True)

 

#I want to extract the nouns from message: response and display it in console
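As for the actual question: the chain call returns a dict whose generated text sits under the 'answer' key (assuming RetrievalQAWithSourcesChain's default output keys), so the nouns can be pulled out of that string with TextBlob, which is already imported above. Here is a minimal sketch; the helper name extract_nouns is mine, and TextBlob needs its corpora installed once via `python -m textblob.download_corpora`:

from textblob import TextBlob

def extract_nouns(text):
    """Return the nouns in `text` using TextBlob's part-of-speech tags."""
    blob = TextBlob(text)
    # In the Penn Treebank tagset, tags starting with 'NN'
    # (NN, NNS, NNP, NNPS) mark nouns
    return [word for word, tag in blob.tags if tag.startswith('NN')]

# Inside the /data route, after calling the chain:
#     response = qa_with_sources(user_input)
#     nouns = extract_nouns(response['answer'])
#     print(nouns)  # displays the extracted nouns in the console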


2w3rbyxf1#

What you need for extracting nouns is POS (part-of-speech) tagging from NLTK.
The steps are simple:
1. Split the passage into sentences.
2. Split each sentence into tokens.
3. Tag the tokens (this produces a list of tuples, where the first part of each tuple is the word and the second part is the tag).

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

stop_words = set(stopwords.words('english'))

# txt is the text you want to analyse
tokenized = sent_tokenize(txt)
for i in tokenized:
    # The word tokenizer finds the words and punctuation in a string
    wordsList = nltk.word_tokenize(i)
    # Remove stop words from wordsList
    wordsList = [w for w in wordsList if w not in stop_words]
    # Run the part-of-speech (POS) tagger over the tokens
    tagged = nltk.pos_tag(wordsList)
    print(tagged)

The sample output looks like this:

[('Marriage', 'NN'), ('big', 'JJ'), ('step', 'NN'), ('one', 'CD'), ('’', 'NN'), ('life', 'NN')]


Any tuple whose second part starts with NN is a noun (word).
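So the final step, which the answer leaves implicit, is to filter the tagged list down to those tuples, for example:

# Keep only the words whose tag starts with 'NN'
nouns = [word for word, tag in tagged if tag.startswith('NN')]
print(nouns)  # for the sample output above: ['Marriage', 'step', '’', 'life']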
