Change recommendation strategy

This commit is contained in:
estromenko 2025-11-07 00:06:31 +03:00
parent 7cce1cdc04
commit df33ce79bb
4 changed files with 1294 additions and 706 deletions

View File

@ -9,6 +9,7 @@ dependencies = [
"django>=5.2.7", "django>=5.2.7",
"gunicorn>=23.0.0", "gunicorn>=23.0.0",
"langchain>=0.3.27", "langchain>=0.3.27",
"langchain-community>=0.4.1",
"langchain-openai>=0.3.35", "langchain-openai>=0.3.35",
"langchain-qdrant>=1.1.0", "langchain-qdrant>=1.1.0",
"langgraph-checkpoint-postgres>=3.0.0", "langgraph-checkpoint-postgres>=3.0.0",

1831
uv.lock

File diff suppressed because it is too large Load Diff

View File

@ -1,19 +1,18 @@
import traceback from datetime import datetime, timedelta
from itertools import batched
from concurrent.futures import ThreadPoolExecutor
from django.core.management import BaseCommand
from django.conf import settings
import clickhouse_connect import clickhouse_connect
from vacancies.main.vector_store import add_vectors, extract_features, qdrant_client from django.core.management import BaseCommand
from qdrant_client.models import OrderBy
from vacancies.conf.settings import CLICKHOUSE_HOST, CLICKHOUSE_PORT from vacancies.conf.settings import CLICKHOUSE_HOST, CLICKHOUSE_PORT
from vacancies.main.vector_store import add_vectors, extract_features, qdrant_client
clickhouse_client = clickhouse_connect.create_client(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT) clickhouse_client = clickhouse_connect.create_client(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT)
query = """ query = """
SELECT id, chat_username, telegram_id, message, timestamp SELECT id, chat_username, telegram_id, message, timestamp
FROM telegram_parser_chatmessage FROM telegram_parser_chatmessage
WHERE timestamp >= now() - INTERVAL 30 DAY WHERE timestamp >= %(timestamp)s
AND length(message) > 150 AND length(message) > 150
AND arrayCount(x -> position(message, x) > 0, [ AND arrayCount(x -> position(message, x) > 0, [
'ваканси', 'ищем', 'требуется', 'разработчик', 'будет плюсом', 'ваканси', 'ищем', 'требуется', 'разработчик', 'будет плюсом',
@ -49,24 +48,31 @@ class Command(BaseCommand):
next_page_offset = response[1] next_page_offset = response[1]
exist_points_set = tuple(set(exist_points_ids)) exist_points_set = tuple(set(exist_points_ids))
result_rows = clickhouse_client.query(query, parameters={"exist_points": exist_points_set}).result_rows response = qdrant_client.scroll(
with ThreadPoolExecutor(max_workers=settings.COLLECT_VACANCIES_BATCH_SIZE) as pool: collection_name="vacancies",
pool.map(self._process_batch, batched(result_rows, settings.COLLECT_VACANCIES_BATCH_SIZE)) limit=1,
order_by=OrderBy(
key="timestamp",
direction="desc",
),
)
last_point_timestamp = datetime.now() - timedelta(days=30)
if response:
last_point_timestamp = response[0][0].payload["timestamp"]
def _process_batch(self, result_rows): result_rows = clickhouse_client.query(
try: query,
for index, row in enumerate(result_rows): parameters={"timestamp": last_point_timestamp, "exist_points": exist_points_set},
(id, chat_username, telegram_id, message, timestamp) = row ).result_rows
link = f"https://t.me/{chat_username}/{telegram_id}" for index, row in enumerate(result_rows):
print(f"Processing {index+1}/{len(result_rows)} link: {link}") (id, chat_username, telegram_id, message, timestamp) = row
features = extract_features(message) link = f"https://t.me/{chat_username}/{telegram_id}"
print(f"Processing {index+1}/{len(result_rows)} link: {link}")
add_vectors( features = extract_features(message)
"vacancies", add_vectors(
id, "vacancies",
features.model_dump(), id,
{'content': message, 'features_json': features.model_dump(), "link": link, "timestamp": timestamp}, features.model_dump(),
) {'content': message, 'features_json': features.model_dump(), "link": link, "timestamp": timestamp},
except Exception as exc: )
traceback.print_exception(exc)

View File

@ -1,12 +1,10 @@
from qdrant_client import models from langchain_community.embeddings import DeepInfraEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI from langchain_openai import ChatOpenAI
from qdrant_client import QdrantClient from qdrant_client import QdrantClient, models
from qdrant_client.models import Filter from qdrant_client.models import Filter, HasIdCondition
from vacancies.main.models import VacancyFeatures
from vacancies.conf.settings import QDRANT_URL from vacancies.conf.settings import QDRANT_URL
from vacancies.main.models import RecommendedVacancy from vacancies.main.models import RecommendedVacancy, VacancyFeatures
from qdrant_client.models import HasIdCondition
qdrant_client = QdrantClient(url=QDRANT_URL) qdrant_client = QdrantClient(url=QDRANT_URL)
@ -16,37 +14,44 @@ FEATURE_NAMES = [
] ]
weights = { weights = {
"job_title": 25, "job_title": 42,
"employment_type": 5, "employment_type": 5,
"work_format": 5, "work_format": 5,
"experience": 8, "experience": 8,
"position_level": 12, "position_level": 5,
"industry": 10, "industry": 1,
"tech_stack": 14, "tech_stack": 10,
"location": 5, "location": 4,
"salary_range": 5, "salary_range": 10,
"languages": 5, "languages": 4,
"education": 2, "education": 2,
"schedule": 2, "schedule": 2,
"additional_requirements": 2, "additional_requirements": 2,
} }
vectors_config = { vectors_config = {
name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES name: models.VectorParams(size=4096, distance=models.Distance.COSINE) for name in FEATURE_NAMES
} }
if not qdrant_client.collection_exists("vacancies"): if not qdrant_client.collection_exists("vacancies"):
qdrant_client.create_collection( qdrant_client.create_collection(
collection_name="vacancies", collection_name="vacancies",
vectors_config=vectors_config vectors_config=vectors_config,
)
qdrant_client.create_payload_index(
collection_name="vacancies",
field_name="timestamp",
field_schema="datetime",
) )
if not qdrant_client.collection_exists("cvs"): if not qdrant_client.collection_exists("cvs"):
qdrant_client.create_collection( qdrant_client.create_collection(
collection_name="cvs", collection_name="cvs",
vectors_config=vectors_config vectors_config=vectors_config,
) )
embedding = OpenAIEmbeddings(model="text-embedding-3-large") embedding = DeepInfraEmbeddings(
model_id="Qwen/Qwen3-Embedding-8B-batch",
)
def _prepare_texts(features): def _prepare_texts(features):
"""Prepare texts for each feature from features dict.""" """Prepare texts for each feature from features dict."""
@ -66,7 +71,7 @@ def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
texts = _prepare_texts(features) texts = _prepare_texts(features)
vectors = {} vectors = {}
for name, text in texts.items(): for name, text in texts.items():
vectors[name] = [0.0] * 3072 vectors[name] = [0.0] * 4096
if text: if text:
vec = embedding.embed_query(text) vec = embedding.embed_query(text)
vectors[name] = vec vectors[name] = vec
@ -93,7 +98,7 @@ def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
scored.append({"id": vid, "score": total}) scored.append({"id": vid, "score": total})
scored.sort(key=lambda x: x["score"], reverse=True) scored.sort(key=lambda x: x["score"], reverse=True)
if scored and scored[0]["score"] > 90: # threshold if scored and scored[0]["score"] > 80: # threshold
return return
qdrant_client.upsert( qdrant_client.upsert(
@ -118,48 +123,43 @@ def search_similarities(query_filter: Filter, cv_id: int):
max_similarities = {} max_similarities = {}
vacancies_content = {} vacancies_content = {}
for name, vec in cv.vector.items(): for name, vec in cv.vector.items():
if any(v != 0 for v in vec): results = qdrant_client.query_points(
results = qdrant_client.query_points( collection_name="vacancies",
collection_name="vacancies", query=vec,
query=vec, using=name,
using=name, limit=100000,
limit=100, with_payload=True,
with_payload=True, query_filter=query_filter,
query_filter=query_filter, )
) for res in results.points:
for res in results.points: vid = res.id
vid = res.id sim = res.score
sim = res.score if vid not in max_similarities:
if vid not in max_similarities: max_similarities[vid] = {}
max_similarities[vid] = {} max_similarities[vid][name] = sim
max_similarities[vid][name] = sim if vid not in vacancies_content:
if vid not in vacancies_content: vacancies_content[vid] = {}
vacancies_content[vid] = {} vacancies_content[vid]["content"] = res.payload["content"]
vacancies_content[vid]["content"] = res.payload["content"] vacancies_content[vid]["features_json"] = res.payload["features_json"]
vacancies_content[vid]["link"] = res.payload["link"] vacancies_content[vid]["link"] = res.payload["link"]
scored = [] scored = []
for vid, feature_sims in max_similarities.items(): for vid, feature_sims in max_similarities.items():
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims) total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
scored.append({"id": vid, "score": total, "content": vacancies_content[vid]["content"], "link": vacancies_content[vid]["link"]}) scored.append({
"id": vid,
"score": total,
"content": vacancies_content[vid]["content"],
"features_json": vacancies_content[vid]["features_json"],
"link": vacancies_content[vid]["link"],
"sims": feature_sims,
})
scored.sort(key=lambda x: x["score"], reverse=True) scored.sort(key=lambda x: x["score"], reverse=True)
import pprint
pprint.pprint(scored[:5])
prompt = f""" return scored[0]["id"], scored[0]["content"], scored[0]["link"]
Резюме: {cv.payload['content']}
Среди вакансий ниже выбери одну наиболее релевантную и выведи ее индекс(от 0 до 9).
Иногда могут попадаться чужие резюме вместо вакансий, их отдавать нельзя.
В ответе выведи только число. Если среди вакансий нет подходящих, то верни -1.
{scored[:10]}
"""
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1)
response = openai_client.invoke(prompt)
index = int(response.content)
if index == -1:
return None
return scored[index]["id"], scored[index]["content"], scored[index]["link"]
def extract_features(content: str) -> VacancyFeatures: def extract_features(content: str) -> VacancyFeatures: