我想从 message: response 中提取名词,并在控制台中显示或打印它们,该如何做到这一点?我试过 NLTK 和 TextBlob 之类的库,但我不知道如何正确实现它。我甚至问了 GitHub Copilot,但它只是给出一些随机的输出,无法工作。我有一个任务,涉及从一个名为 message: response 的变量中提取名词。我想在控制台中显示提取的名词或将它们打印在屏幕上。如何使用 Python 完成此任务?我试过使用一些库,比如 NLTK 和 TextBlob,但我不确定如何正确使用它们。我也向 GitHub Copilot 寻求帮助,但是它没有生成任何有用的代码,它只是显示了一些不起作用的随机输出。有人能帮我解决这个问题吗?
from flask import Flask , render_template , jsonify ,request
from flask_cors import CORS
from textblob import TextBlob
import requests , openai , os
from langchain.llms import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from datasets import load_dataset , Dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from tqdm.auto import tqdm
from uuid import uuid4
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain
# --- Flask application ---
app = Flask(__name__)
CORS(app)  # attach flask-cors handling to the app

# --- Conversation LLM and summary memory ---
# OpenAI() is constructed without an explicit key argument here.
llm = OpenAI()
memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=100)

# --- Source data for Pinecone ---
# NOTE(review): this only binds a module-level name; it is not passed to
# load_dataset() below -- confirm whether it was meant to be an argument.
trust_remote_code = True
data = load_dataset("wikipedia", "20220301.simple", split="train[:1000]")
data[6]  # bare expression: the fetched record is discarded

# Encoding used by tiktoken_len() to count tokens.
tokenizer = tiktoken.get_encoding("cl100k_base")
# Length function handed to the text splitter (measures text in tokens).
def tiktoken_len(text: str) -> int:
    """Return how many tokens the module-level ``tokenizer`` produces for *text*.

    An empty ``disallowed_special`` tuple is passed to ``encode``.
    NOTE(review): presumably this keeps special-token text from raising --
    confirm against the tiktoken documentation.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
# Sample call of the length function; the returned count is discarded.
tiktoken_len(
    "hello I am a chunk of text and using the tiktoken_len function "
    "we can find the length of this chunk of text in tokens"
)

# Splitter whose chunk size and overlap are measured by tiktoken_len.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

# Split record 6 of the dataset and keep at most the first three chunks.
chunks = text_splitter.split_text(data[6]["text"])[:3]

# Token counts of those chunks; the tuple is discarded. Indexing 0..2
# raises IndexError if fewer than three chunks were produced.
tiktoken_len(chunks[0]), tiktoken_len(chunks[1]), tiktoken_len(chunks[2])
# Read the OpenAI key from the environment instead of keeping a credential
# in source. The original placeholder stays as the fallback, so the value is
# unchanged when OPENAI_API_KEY is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API KEY")
model_name = 'text-embedding-ada-002'

# Embedding client used for both documents and queries.
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY,
)

# Two sample strings to exercise the embedding call.
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here',
]
res = embed.embed_documents(texts)
# Number of embeddings and length of the first one; the tuple is discarded.
len(res), len(res[0])
# find API key in console at app.pinecone.io
# Taken from the PINECONE_API_KEY environment variable when it is set; the
# original placeholder is the fallback so the value is unchanged otherwise.
YOUR_API_KEY = os.environ.get("PINECONE_API_KEY", "API KEY")
# find ENV (cloud region) next to API key in console
YOUR_ENV = "gcp-starter"
index_name = 'langchain-retrieval-augmentation'

# Use the constants defined above rather than repeating their literal values.
pinecone.init(
    api_key=YOUR_API_KEY,
    environment=YOUR_ENV,
)
index = pinecone.GRPCIndex(index_name)

batch_limit = 100
texts = []
metadatas = []
# NOTE(review): `texts` was just reset to an empty list, so this branch does
# not run as written; nothing in this section fills `texts` or `metadatas`.
# Confirm whether a loading loop is missing here.
if texts:
    ids = [str(uuid4()) for _ in texts]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))
# Third positional argument given to the Pinecone vector store below.
# NOTE(review): presumably the metadata key that holds the raw text -- confirm.
text_field = "text"

# switch back to normal index for langchain
index = pinecone.Index(index_name)
vectorstore = Pinecone(index, embed.embed_query, text_field)

query = "What is the capital of France?"

# Look up the 3 most relevant documents for the query; the result is discarded.
vectorstore.similarity_search(query, k=3)
# completion llm
# NOTE: this rebinds `llm`; `memory` was built earlier with the previous
# OpenAI() instance and keeps using that one.
llm = ChatOpenAI(
    # Reuse the module-level key constant instead of repeating the literal,
    # so the key only has to be configured in one place.
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0,
)

# Question-answering chain that retrieves context from the vector store.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

# Runs one query at import time; the result is discarded.
qa_with_sources(query)
@app.route("/")
def index():
    """Serve the root URL by rendering the ``index.html`` template."""
    # NOTE(review): defining this function rebinds the module-level name
    # `index`, which until this point referred to the Pinecone index object.
    page = render_template("index.html")
    return page
def _print_nouns(result):
    """Print the nouns found in a chain result to the console and return them.

    *result* is the value returned by ``qa_with_sources``. When it is a dict
    its ``"answer"`` entry is analysed; anything else is converted with
    ``str``. Words whose part-of-speech tag starts with ``NN`` are treated as
    nouns. Extraction is best-effort: a failure is reported on the console
    and an empty list is returned.
    """
    answer = result.get("answer", "") if isinstance(result, dict) else result
    try:
        nouns = [
            word
            for word, tag in TextBlob(str(answer)).tags
            if tag.startswith("NN")
        ]
    except Exception as exc:
        # Console output is a diagnostic aid only (for example the tagger
        # data may not be installed), so it must never fail the request.
        print(f"Noun extraction skipped: {exc}")
        return []
    print("Nouns:", nouns)
    return nouns


@app.route('/data', methods=['POST'])
def getdata():
    """Answer the question posted as JSON ``{"data": <text>}``.

    Returns JSON ``{"message": ..., "response": bool}``. ``message`` is the
    chain result on success and an ``Error:...`` string on failure. A missing,
    non-JSON, non-object or empty body is rejected with HTTP 400 instead of
    raising outside the ``try`` block.
    """
    # silent=True makes get_json() return None for an absent or malformed
    # body rather than aborting the request.
    payload = request.get_json(silent=True)
    user_input = payload.get('data') if isinstance(payload, dict) else None
    if not user_input:
        return jsonify({
            "message": "Error:request body must be JSON with a non-empty 'data' field",
            "response": False,
        }), 400

    try:
        response = qa_with_sources(user_input)
        # Show the nouns of the answer in the server console.
        _print_nouns(response)
        return jsonify({"message": response, "response": True})
    except Exception as e:
        print(e)
        error_message = f'Error:{str (e)}'
        return jsonify({"message": error_message, "response": False})
# Start the Flask server (debug mode on) only when this file is run directly.
if __name__ == "__main__":
    app.run(debug=True)
#I want to extract the nouns from message: response and display it in console
字符串
1条答案
按热度按时间2w3rbyxf1#
提取名词所需要的是NLTK中的POS(词性)标记。
下面是简单的步骤。
1.把这一段分成句子。
2.把每个句子分解成词元(token)。
3.对词元进行词性标注(得到一个元组列表,其中每个元组的第一项是单词,第二项是词性标记)。
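```python
import nltk

sentences = nltk.sent_tokenize(paragraph)
for sentence in sentences:
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    print(tagged)
```
示例输出如下所示
```
[('Paris', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('France', 'NNP'), ('.', '.')]
```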
任何第二部分以NN开头的元组(单词)都是名词。