Change recommendation strategy
This commit is contained in:
parent
7cce1cdc04
commit
df33ce79bb
@ -9,6 +9,7 @@ dependencies = [
|
||||
"django>=5.2.7",
|
||||
"gunicorn>=23.0.0",
|
||||
"langchain>=0.3.27",
|
||||
"langchain-community>=0.4.1",
|
||||
"langchain-openai>=0.3.35",
|
||||
"langchain-qdrant>=1.1.0",
|
||||
"langgraph-checkpoint-postgres>=3.0.0",
|
||||
|
||||
@ -1,19 +1,18 @@
|
||||
import traceback
|
||||
from itertools import batched
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from django.core.management import BaseCommand
|
||||
from django.conf import settings
|
||||
import clickhouse_connect
|
||||
from vacancies.main.vector_store import add_vectors, extract_features, qdrant_client
|
||||
from django.core.management import BaseCommand
|
||||
from qdrant_client.models import OrderBy
|
||||
|
||||
from vacancies.conf.settings import CLICKHOUSE_HOST, CLICKHOUSE_PORT
|
||||
from vacancies.main.vector_store import add_vectors, extract_features, qdrant_client
|
||||
|
||||
clickhouse_client = clickhouse_connect.create_client(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT)
|
||||
|
||||
query = """
|
||||
SELECT id, chat_username, telegram_id, message, timestamp
|
||||
FROM telegram_parser_chatmessage
|
||||
WHERE timestamp >= now() - INTERVAL 30 DAY
|
||||
WHERE timestamp >= %(timestamp)s
|
||||
AND length(message) > 150
|
||||
AND arrayCount(x -> position(message, x) > 0, [
|
||||
'ваканси', 'ищем', 'требуется', 'разработчик', 'будет плюсом',
|
||||
@ -49,24 +48,31 @@ class Command(BaseCommand):
|
||||
next_page_offset = response[1]
|
||||
exist_points_set = tuple(set(exist_points_ids))
|
||||
|
||||
result_rows = clickhouse_client.query(query, parameters={"exist_points": exist_points_set}).result_rows
|
||||
with ThreadPoolExecutor(max_workers=settings.COLLECT_VACANCIES_BATCH_SIZE) as pool:
|
||||
pool.map(self._process_batch, batched(result_rows, settings.COLLECT_VACANCIES_BATCH_SIZE))
|
||||
response = qdrant_client.scroll(
|
||||
collection_name="vacancies",
|
||||
limit=1,
|
||||
order_by=OrderBy(
|
||||
key="timestamp",
|
||||
direction="desc",
|
||||
),
|
||||
)
|
||||
last_point_timestamp = datetime.now() - timedelta(days=30)
|
||||
if response:
|
||||
last_point_timestamp = response[0][0].payload["timestamp"]
|
||||
|
||||
def _process_batch(self, result_rows):
|
||||
try:
|
||||
for index, row in enumerate(result_rows):
|
||||
(id, chat_username, telegram_id, message, timestamp) = row
|
||||
result_rows = clickhouse_client.query(
|
||||
query,
|
||||
parameters={"timestamp": last_point_timestamp, "exist_points": exist_points_set},
|
||||
).result_rows
|
||||
|
||||
link = f"https://t.me/{chat_username}/{telegram_id}"
|
||||
print(f"Processing {index+1}/{len(result_rows)} link: {link}")
|
||||
features = extract_features(message)
|
||||
|
||||
add_vectors(
|
||||
"vacancies",
|
||||
id,
|
||||
features.model_dump(),
|
||||
{'content': message, 'features_json': features.model_dump(), "link": link, "timestamp": timestamp},
|
||||
)
|
||||
except Exception as exc:
|
||||
traceback.print_exception(exc)
|
||||
for index, row in enumerate(result_rows):
|
||||
(id, chat_username, telegram_id, message, timestamp) = row
|
||||
link = f"https://t.me/{chat_username}/{telegram_id}"
|
||||
print(f"Processing {index+1}/{len(result_rows)} link: {link}")
|
||||
features = extract_features(message)
|
||||
add_vectors(
|
||||
"vacancies",
|
||||
id,
|
||||
features.model_dump(),
|
||||
{'content': message, 'features_json': features.model_dump(), "link": link, "timestamp": timestamp},
|
||||
)
|
||||
|
||||
@ -1,12 +1,10 @@
|
||||
from qdrant_client import models
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from langchain_community.embeddings import DeepInfraEmbeddings
|
||||
from langchain_openai import ChatOpenAI
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import Filter
|
||||
from vacancies.main.models import VacancyFeatures
|
||||
from qdrant_client import QdrantClient, models
|
||||
from qdrant_client.models import Filter, HasIdCondition
|
||||
|
||||
from vacancies.conf.settings import QDRANT_URL
|
||||
from vacancies.main.models import RecommendedVacancy
|
||||
from qdrant_client.models import HasIdCondition
|
||||
from vacancies.main.models import RecommendedVacancy, VacancyFeatures
|
||||
|
||||
qdrant_client = QdrantClient(url=QDRANT_URL)
|
||||
|
||||
@ -16,37 +14,44 @@ FEATURE_NAMES = [
|
||||
]
|
||||
|
||||
weights = {
|
||||
"job_title": 25,
|
||||
"job_title": 42,
|
||||
"employment_type": 5,
|
||||
"work_format": 5,
|
||||
"experience": 8,
|
||||
"position_level": 12,
|
||||
"industry": 10,
|
||||
"tech_stack": 14,
|
||||
"location": 5,
|
||||
"salary_range": 5,
|
||||
"languages": 5,
|
||||
"position_level": 5,
|
||||
"industry": 1,
|
||||
"tech_stack": 10,
|
||||
"location": 4,
|
||||
"salary_range": 10,
|
||||
"languages": 4,
|
||||
"education": 2,
|
||||
"schedule": 2,
|
||||
"additional_requirements": 2,
|
||||
}
|
||||
|
||||
vectors_config = {
|
||||
name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES
|
||||
name: models.VectorParams(size=4096, distance=models.Distance.COSINE) for name in FEATURE_NAMES
|
||||
}
|
||||
|
||||
if not qdrant_client.collection_exists("vacancies"):
|
||||
qdrant_client.create_collection(
|
||||
collection_name="vacancies",
|
||||
vectors_config=vectors_config
|
||||
vectors_config=vectors_config,
|
||||
)
|
||||
qdrant_client.create_payload_index(
|
||||
collection_name="vacancies",
|
||||
field_name="timestamp",
|
||||
field_schema="datetime",
|
||||
)
|
||||
if not qdrant_client.collection_exists("cvs"):
|
||||
qdrant_client.create_collection(
|
||||
collection_name="cvs",
|
||||
vectors_config=vectors_config
|
||||
vectors_config=vectors_config,
|
||||
)
|
||||
|
||||
embedding = OpenAIEmbeddings(model="text-embedding-3-large")
|
||||
embedding = DeepInfraEmbeddings(
|
||||
model_id="Qwen/Qwen3-Embedding-8B-batch",
|
||||
)
|
||||
|
||||
def _prepare_texts(features):
|
||||
"""Prepare texts for each feature from features dict."""
|
||||
@ -66,7 +71,7 @@ def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
|
||||
texts = _prepare_texts(features)
|
||||
vectors = {}
|
||||
for name, text in texts.items():
|
||||
vectors[name] = [0.0] * 3072
|
||||
vectors[name] = [0.0] * 4096
|
||||
if text:
|
||||
vec = embedding.embed_query(text)
|
||||
vectors[name] = vec
|
||||
@ -93,7 +98,7 @@ def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
|
||||
scored.append({"id": vid, "score": total})
|
||||
|
||||
scored.sort(key=lambda x: x["score"], reverse=True)
|
||||
if scored and scored[0]["score"] > 90: # threshold
|
||||
if scored and scored[0]["score"] > 80: # threshold
|
||||
return
|
||||
|
||||
qdrant_client.upsert(
|
||||
@ -118,48 +123,43 @@ def search_similarities(query_filter: Filter, cv_id: int):
|
||||
max_similarities = {}
|
||||
vacancies_content = {}
|
||||
for name, vec in cv.vector.items():
|
||||
if any(v != 0 for v in vec):
|
||||
results = qdrant_client.query_points(
|
||||
collection_name="vacancies",
|
||||
query=vec,
|
||||
using=name,
|
||||
limit=100,
|
||||
with_payload=True,
|
||||
query_filter=query_filter,
|
||||
)
|
||||
for res in results.points:
|
||||
vid = res.id
|
||||
sim = res.score
|
||||
if vid not in max_similarities:
|
||||
max_similarities[vid] = {}
|
||||
max_similarities[vid][name] = sim
|
||||
if vid not in vacancies_content:
|
||||
vacancies_content[vid] = {}
|
||||
vacancies_content[vid]["content"] = res.payload["content"]
|
||||
vacancies_content[vid]["link"] = res.payload["link"]
|
||||
results = qdrant_client.query_points(
|
||||
collection_name="vacancies",
|
||||
query=vec,
|
||||
using=name,
|
||||
limit=100000,
|
||||
with_payload=True,
|
||||
query_filter=query_filter,
|
||||
)
|
||||
for res in results.points:
|
||||
vid = res.id
|
||||
sim = res.score
|
||||
if vid not in max_similarities:
|
||||
max_similarities[vid] = {}
|
||||
max_similarities[vid][name] = sim
|
||||
if vid not in vacancies_content:
|
||||
vacancies_content[vid] = {}
|
||||
vacancies_content[vid]["content"] = res.payload["content"]
|
||||
vacancies_content[vid]["features_json"] = res.payload["features_json"]
|
||||
vacancies_content[vid]["link"] = res.payload["link"]
|
||||
|
||||
scored = []
|
||||
for vid, feature_sims in max_similarities.items():
|
||||
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
|
||||
scored.append({"id": vid, "score": total, "content": vacancies_content[vid]["content"], "link": vacancies_content[vid]["link"]})
|
||||
scored.append({
|
||||
"id": vid,
|
||||
"score": total,
|
||||
"content": vacancies_content[vid]["content"],
|
||||
"features_json": vacancies_content[vid]["features_json"],
|
||||
"link": vacancies_content[vid]["link"],
|
||||
"sims": feature_sims,
|
||||
})
|
||||
|
||||
scored.sort(key=lambda x: x["score"], reverse=True)
|
||||
import pprint
|
||||
pprint.pprint(scored[:5])
|
||||
|
||||
prompt = f"""
|
||||
Резюме: {cv.payload['content']}
|
||||
|
||||
Среди вакансий ниже выбери одну наиболее релевантную и выведи ее индекс(от 0 до 9).
|
||||
Иногда могут попадаться чужие резюме вместо вакансий, их отдавать нельзя.
|
||||
В ответе выведи только число. Если среди вакансий нет подходящих, то верни -1.
|
||||
{scored[:10]}
|
||||
"""
|
||||
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1)
|
||||
response = openai_client.invoke(prompt)
|
||||
index = int(response.content)
|
||||
if index == -1:
|
||||
return None
|
||||
|
||||
return scored[index]["id"], scored[index]["content"], scored[index]["link"]
|
||||
return scored[0]["id"], scored[0]["content"], scored[0]["link"]
|
||||
|
||||
|
||||
def extract_features(content: str) -> VacancyFeatures:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user