API

Recommendation

recomend.py

chat(text)

Takes a text message and returns an AI-generated text response using a GPT4All model.

Parameters

text : str

A string representing the input text message.

Returns

str The AI-generated text response.

Example

chat("Hello, how are you?") ["I'm fine, thank you. How about yourself?"]

Source code in src/recomend.py
def chat(text):
    """
    Takes a text message and returns an AI-generated text response using a GPT4All model.

    Parameters
    ----------
    text : str
        A string representing the input text message.

    Returns
    -------
    str
        The AI-generated text response.

    Example
    -------
    >>> chat("Hello, how are you?")
    "I'm fine, thank you. How about yourself?"
    """

    gptj = GPT4All("ggml-gpt4all-j-v1.3-groovy", "./src/model/")
    messages = [{"role": "user", "content": text}]

    return gptj.chat_completion(messages)["choices"][0]["content"]
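
A minimal usage sketch (assuming the ggml-gpt4all-j-v1.3-groovy weights are already present under ./src/model/ and the project root is on the import path):

from src.recomend import chat

# chat() returns a single generated string.
reply = chat("Recommend me a science-fiction novel.")
print(reply)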

chat_t5(text, **kwargs)

Generates a chatbot reply for the input text using a locally stored T5 model.

Parameters

text : str

The input text to be processed by the chatbot model.

**kwargs : dict Optional keyword arguments forwarded to the model's generate() call.

Returns

str The generated reply, decoded from the model output.

Source code in src/recomend.py
def chat_t5(text, **kwargs):
    """
    Initializes the chatbot model and returns the model name.

    Parameters
    ----------
    text : str
        The input text to be processed by the chatbot model.
    **kwargs : dict
        Optional keyword arguments to be passed to the chatbot model.

    Returns
    -------
    str
        The name of the chatbot model.
    """

    MODEL_NAME = "cointegrated/rut5-base-multitask"

    tokenizer = T5Tokenizer.from_pretrained("src/model/")
    model = T5ForConditionalGeneration.from_pretrained("src/model/")
    task_prefix = "Chat: "
    inputs = tokenizer(task_prefix + text, return_tensors="pt")
    with torch.no_grad():
        hypotheses = model.generate(**inputs, num_beams=5, **kwargs)
    return tokenizer.decode(hypotheses[0], skip_special_tokens=True)
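
A usage sketch; extra keyword arguments are forwarded to model.generate(), so generation can be tuned per call (max_length below is illustrative):

from src.recomend import chat_t5

# "Hi! What should I read?" - the model is a Russian-language T5.
answer = chat_t5("Привет! Что почитать?", max_length=64)
print(answer)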

popular_items(data, data_items, genre=None, threshold_progress=40, n_items=10)

Recommends the top-n most popular items, optionally filtered by genre.

Parameters

data : pd.DataFrame

DataFrame containing the user-item interactions.

data_items : pd.DataFrame

DataFrame containing the item metadata.

genre : str, optional

Genre of the items to be recommended; if None, all genres are considered.

threshold_progress : int, optional

Minimum reading progress for an interaction to count, by default 40.

n_items : int, optional

Number of items to return, by default 10.

Returns

np.ndarray

The titles of the top-n popular items for the given genre.

Source code in src/recomend.py
def popular_items(
    data: pd.DataFrame,
    data_items: pd.DataFrame,
    genre: str | None = None,
    threshold_progress: int = 40,
    n_items: int = 10,
):
    """
    Recomends the top n popular items for a given genre.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe containing the user-item interactions
    df_items : pd.DataFrame
        dataframe containing the items
    genre : str
        genre of the items to be recommended
    threshold_progress : int, optional
        threshold of progress items, by default 40
    n : int, optional
        count items to be return, by default 10
    Returns
    -------
    output: np.ndarray
        the top n popular items for a given genre
    """
    mask = data[data["progress"] > threshold_progress][["item_id"]]
    mask = mask.value_counts()

    items_count = pd.DataFrame(
        mask,
        columns=["count"],
    ).sort_index()

    items_name = data_items[["id", "title", "genres"]]
    items_name = items_name.set_index("id")
    items_name = items_name.sort_index()

    items_name.genres = items_name.genres.fillna("Другие жанры")  # "Other genres"
    items_name.genres = items_name.genres.apply(lambda x: x.split(","))

    if genre is not None:
        items_name = items_name.explode(column="genres")
        items_name = items_name[items_name["genres"] == genre][["title"]]

    count_titles = items_name.merge(
        items_count,
        left_index=True,
        right_on="item_id",
    )

    output = count_titles.sort_values(by="count", ascending=False)
    output = output["title"].values[:n_items]
    return output
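
A usage sketch, assuming the CSV files described under read_data() are present; the genre value is illustrative and must match the genres column in items.csv:

from src.recomend import popular_items
from src.utils import read_data

interactions, _, items = read_data("./src/")
top = popular_items(interactions, items, genre="Фантастика", n_items=5)
print(top)  # np.ndarray with the titles of the 5 most popular items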

recomend_als(user_id)

Recommends books using the ALS algorithm.

Parameters

user_id : int

The ID of the user to recommend books for.

Returns

pandas.DataFrame A DataFrame containing information about the recommended books.

Source code in src/recomend.py
def recomend_als(user_id):
    """Recommends books using the ALS algorithm.

    Parameters
    ----------
    user_id : int
        The ID of the user to recommend books for.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing information about the recommended books.
    """
    with open("./src/model/als.pickle", "rb") as f:
        model = pickle.load(f)
    similar_users = model.similar_users(int(user_id))[0]
    if len(similar_users) == 0:
        similar_users = model.similar_users(-1)[0]
    interactions, _, items_data = read_data("./src/")
    interactions, _ = create_weighted_interaction_matrix(interactions)

    dataset = interactions[interactions["user_id"].isin(similar_users)].sort_values(
        by="target",
        ascending=False,
    )["item_id"]
    items_data = items_data[items_data["id"].isin(dataset)]
    return items_data
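
A usage sketch (the user id is illustrative; ./src/model/als.pickle and the interaction CSVs must exist):

from src.recomend import recomend_als

books = recomend_als(42)
print(books[["title", "genres"]].head())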

recomend_bm25(item_id)

Recommends books similar to a given book ID using the BM25 algorithm.

Parameters

item_id : int

The ID of the book to find similar books to.

Returns

pandas.DataFrame A DataFrame containing information about the similar books.

Source code in src/recomend.py
def recomend_bm25(item_id):
    """
    Recommends books similar to a given book ID using the BM25 algorithm.

    Parameters
    ----------
    item_id : int
        The ID of the book to find similar books to.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing information about the similar books.
    """

    _, _, items_data = read_data("./src/")

    with open("./src/model/bm25.pickle", "rb") as f:
        item_model = pickle.load(f)
    similar_items = item_model.similar_items(int(item_id))[0][1:]
    books = items_data[items_data["id"].isin(similar_items)]
    return books
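
A usage sketch (the item id is illustrative; ./src/model/bm25.pickle and the CSVs must exist):

from src.recomend import recomend_bm25

similar = recomend_bm25(128)
print(similar["title"].tolist())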

summarize(input_sequences)

Summarizes the input text.

Parameters

input_sequences : str

The input text to be summarized

Returns

str The summarized text.

Source code in src/recomend.py
def summarize(input_sequences: str):
    """Summarizes the input text

    Parameters
    ----------
    input_sequences : str
        The input text to be summarized

    Returns
    -------
    str
        summarized text
    """
    device = torch.device("cpu")

    MODEL_NAME = "UrukHan/t5-russian-summarization"
    tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

    task_prefix = "Summarize: "
    if not isinstance(input_sequences, list):
        input_sequences = [input_sequences]
    encoded = tokenizer(
        [task_prefix + sequence for sequence in input_sequences],
        padding="longest",
        max_length=256,
        truncation=True,
        return_tensors="pt",
    )

    predicts = model.generate(**encoded.to(device))
    summary = tokenizer.batch_decode(predicts, skip_special_tokens=True)
    return summary[0]
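
A usage sketch; the UrukHan/t5-russian-summarization checkpoint is downloaded from the Hugging Face hub on first use, and the input string below is an illustrative stub:

from src.recomend import summarize

# "A novel about the fate of a master and his beloved in 1930s Moscow."
summary = summarize("Роман о судьбе мастера и его возлюбленной в Москве 1930-х годов.")
print(summary)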

Utils

create_weighted_interaction_matrix(data, alpha=0.01)

Create a weighted interaction matrix based on the input DataFrame.

Parameters

data : pd.DataFrame

The input DataFrame containing interactions between users and items.

alpha : float, optional

A hyperparameter intended to weight interactions by recency; not used by the current implementation. Default is 0.01.

Returns

Tuple[pd.DataFrame, sp.coo_matrix] The input DataFrame with an added target column, and a sparse matrix representing the weighted interactions.

Source code in src/utils.py
def create_weighted_interaction_matrix(data: pd.DataFrame, alpha=0.01):
    """
    Create a weighted interaction matrix based on the input DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing interactions between users and items.
    alpha : float, optional
        A hyperparameter used to weight interactions based on recency. Default is 0.01.

    Returns
    -------
    Tuple[pd.DataFrame, sp.coo_matrix]
        The input DataFrame with added columns for days since interaction, weight, and target, and a sparse matrix
        representing the weighted interactions.
    """

    data.loc[:, "target"] = (
        data["rating"].fillna(0)
        * 20 + data["progress"]
    ) / 2

    interactions_sparse = sp.coo_matrix(
        (
            data["target"].astype(float),
            (data["user_id"].astype(int), data["item_id"].astype(int)),
        ),
    )
    return data, interactions_sparse
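
A toy sketch of the weighting rule target = (rating * 20 + progress) / 2, which maps a 0-5 rating and a 0-100 progress value onto the same scale; missing ratings count as 0:

import pandas as pd

from src.utils import create_weighted_interaction_matrix

toy = pd.DataFrame({
    "user_id": [0, 1],
    "item_id": [10, 11],
    "rating": [5.0, None],
    "progress": [80, 40],
})
weighted, sparse = create_weighted_interaction_matrix(toy)
print(weighted["target"].tolist())  # [90.0, 20.0]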

read_data(path)

Reads data from CSV files located at the given path.

Parameters

path : str

The path where the CSV files are located.

Returns

tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] A tuple containing three pandas DataFrames representing the interactions, users, and items CSV files, respectively.

Raises

AssertionError If any of the csv files are not found at the given path.

Source code in src/utils.py
def read_data(path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Reads data from CSV files located at the given path.

    Parameters
    ----------
    path : str
        The path where the csv files are located.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
        A tuple containing three pandas DataFrames representing the data, users,
        and items csv files, respectively.

    Raises
    ------
    AssertionError
        If any of the csv files are not found at the given path.
    """

    path_data = os.path.join(path, "data/interactions.csv")
    path_items = os.path.join(path, "data/items.csv")
    path_users = os.path.join(path, "data/users.csv")

    # logger.info(f"Reading data from csv files {path_data}")
    assert os.path.exists(path_data), f"File {path_data} not found."
    assert os.path.exists(path_items), f"File {path_items} not found."
    assert os.path.exists(path_users), f"File {path_users} not found."

    data = pd.read_csv(path_data)
    data_users = pd.read_csv(path_users)
    data_items = pd.read_csv(path_items)

    data["start_date"] = pd.to_datetime(data["start_date"])
    data["rating"] = np.array(data["rating"].values, dtype=np.float32)

    return data, data_users, data_items
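
A usage sketch; the function expects data/interactions.csv, data/items.csv and data/users.csv under the given path:

from src.utils import read_data

interactions, users, items = read_data("./src/")
print(len(interactions), "interactions,", interactions["user_id"].nunique(), "users")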

Validation

fit_implicit_als(train_sparse, test_sparse=None, iterations=15, factors=20, regularization=0.1, alpha=40, fold=0)

Fits an implicit ALS model using the given training sparse matrix. Optionally evaluates the model using the given test sparse matrix and logs the results to Weights & Biases. If no test sparse matrix is given, the trained model is pickled and saved to disk.

Parameters

train_sparse : scipy.sparse.csr_matrix

The training sparse matrix.

test_sparse : scipy.sparse.csr_matrix, optional

The test sparse matrix.

iterations : int, optional

The number of iterations to run the ALS algorithm. Default is 15.

factors : int, optional

The number of latent factors to use in the model. Default is 20.

regularization : float, optional

The regularization parameter to use in the ALS algorithm. Default is 0.1.

alpha : int, optional

The alpha hyperparameter to use in the confidence matrix of the ALS algorithm. Default is 40.

fold : int, optional

The fold number recorded alongside the logged metrics. Default is 0.

Returns

None

Source code in src/validation.py
def fit_implicit_als(
    train_sparse,
    test_sparse=None,
    iterations=15,
    factors=20,
    regularization=0.1,
    alpha=40,
    fold=0,
):
    """
    Fits an implicit ALS model using the given training sparse matrix.
    Optionally evaluates the model using the given test sparse matrix and logs the results to Weights & Biases.
    If no test sparse matrix is given, the trained model is pickled and saved to disk.

    Parameters
    ----------
    train_sparse : scipy.sparse.csr_matrix
        The training sparse matrix.
    test_sparse : scipy.sparse.csr_matrix, optional
        The test sparse matrix.
    iterations : int, optional
        The number of iterations to run the ALS algorithm. Default is 15.
    factors : int, optional
        The number of latent factors to use in the model. Default is 20.
    regularization : float, optional
        The regularization parameter to use in the ALS algorithm. Default is 0.1.
    alpha : int, optional
        The alpha hyperparameter to use in the confidence matrix of the ALS algorithm. Default is 40.
    fold : int, optional
        The fold number recorded alongside the logged metrics. Default is 0.

    Returns
    -------
    None
    """

    params = {
        "factors": factors,
        "regularization": regularization,
        "iterations": iterations,
        "alpha": alpha,
    }
    model = AlternatingLeastSquares(**params)

    run = wandb.init(project="MFDP", name="ALS")
    wandb.config.update(params)

    model.fit(train_sparse)

    if test_sparse is not None:
        for k in [1, 5, 10, 100]:
            metrics = ranking_metrics_at_k(
                model,
                train_sparse,
                test_sparse,
                K=k,
                show_progress=False,
            )
            metrics["k"] = k
            metrics["fold"] = fold
            wandb.log(metrics)
    else:
        with open(os.path.join(sys.path[0], "./model/als.pickle"), "wb") as f:
            pickle.dump(model, f)

        artifact = wandb.Artifact("als-model", type="pickle")
        with artifact.new_file(
            os.path.join(sys.path[0], "./model/als.pickle"), mode="wb",
        ) as f:
            pickle.dump(model, f)
        run.log_artifact(artifact)
        wandb.finish()
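
A sketch tying the validation helpers together (assumes the CSVs under ./src/ and a configured Weights & Biases login; each call opens its own W&B run, mirroring the function's behaviour):

from src.utils import create_weighted_interaction_matrix, read_data
from src.validation import fit_implicit_als, split_train_test_users

interactions, _, _ = read_data("./src/")

# Cross-validated evaluation: metrics for each fold are logged to W&B.
for fold, (train, test) in enumerate(split_train_test_users(interactions)):
    fit_implicit_als(train, test, fold=fold)

# Final fit on the full matrix: no test set, so the model is pickled and saved.
_, full = create_weighted_interaction_matrix(interactions)
fit_implicit_als(full.tocsr())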

fit_implicit_bm25(train_sparse, test_sparse=None, fold=0)

Fits an implicit BM25 item-item model using the given training sparse matrix. Optionally evaluates the model using the given test sparse matrix and logs the results to Weights & Biases. If no test sparse matrix is given, the trained model is pickled and saved to disk.

Parameters

train_sparse : scipy.sparse.csr_matrix

The training sparse matrix.

test_sparse : scipy.sparse.csr_matrix, optional

The test sparse matrix.

fold : int, optional

The fold number recorded alongside the logged metrics. Default is 0.

Returns

None

Source code in src/validation.py
def fit_implicit_bm25(
    train_sparse,
    test_sparse=None,
    fold=0,
):
    """
    Fits an implicit BM25 item-item model using the given training sparse matrix.
    Optionally evaluates the model using the given test sparse matrix and logs the results to Weights & Biases.
    If no test sparse matrix is given, the trained model is pickled and saved to disk.

    Parameters
    ----------
    train_sparse : scipy.sparse.csr_matrix
        The training sparse matrix.
    test_sparse : scipy.sparse.csr_matrix, optional
        The test sparse matrix.
    fold : int, optional
        The fold number recorded alongside the logged metrics. Default is 0.

    Returns
    -------
    None
    """
    run = wandb.init(project="MFDP", name="BM25")

    model = BM25Recommender(K=50, B=0.5)

    model.fit(train_sparse)

    if test_sparse is not None:
        for k in [1, 5, 10, 100]:
            metrics = ranking_metrics_at_k(
                model,
                train_sparse,
                test_sparse,
                K=k,
                show_progress=False,
            )
            metrics["k"] = k
            metrics["fold"] = fold
            wandb.log(metrics)
    else:
        with open(os.path.join(sys.path[0], "./model/bm25.pickle"), "wb") as f:
            pickle.dump(model, f)

        artifact = wandb.Artifact("bm25-model", type="pickle")
        with artifact.new_file(
            os.path.join(sys.path[0], "./model/bm25.pickle"), mode="wb",
        ) as f:
            pickle.dump(model, f)
        run.log_artifact(artifact)
        wandb.finish()
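
The same pattern applies to the BM25 helper; a sketch under the same assumptions as the ALS example above:

from src.utils import create_weighted_interaction_matrix, read_data
from src.validation import fit_implicit_bm25, split_train_test_users

interactions, _, _ = read_data("./src/")
for fold, (train, test) in enumerate(split_train_test_users(interactions)):
    fit_implicit_bm25(train, test, fold=fold)

_, full = create_weighted_interaction_matrix(interactions)
fit_implicit_bm25(full.tocsr())  # no test matrix: model is pickled and logged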

split_train_test_users(interactions, num_folds=5)

Splits the interactions dataframe into train and test sets by unique users using k-fold cross-validation.

Parameters

interactions : pandas.DataFrame

The interactions dataframe.

num_folds : int, optional

The number of folds for k-fold cross-validation. Default is 5.

Yields

tuple A tuple of train and test sparse interaction matrices, respectively.

Source code in src/validation.py
def split_train_test_users(interactions, num_folds=5):
    """
    Splits the interactions dataframe into train and test sets by unique users using k-fold cross-validation.

    Parameters
    ----------
    interactions : pandas.DataFrame
        The interactions dataframe.
    num_folds : int, optional
        The number of folds for k-fold cross-validation. Default is 5.

    Yields
    ------
    tuple
        A tuple of train and test sparse interaction matrices, respectively.
    """
    for random_state in range(num_folds):
        _, sparse_mat = create_weighted_interaction_matrix(interactions)
        train, test = train_test_split(sparse_mat, random_state=random_state)
        yield train, test

App

create_product_card(st, k, data)

Creates k columns with data in Streamlit.

Parameters

st : module

The Streamlit module.

k : int

The number of columns to create.

data : pd.DataFrame

The DataFrame whose rows supply the card fields (title, authors, year, genres).

Source code in app.py
def create_product_card(st, k: int, data):
    """
    Creates k columns with data in Streamlit.

    Parameters
    ----------
    st : module
        The Streamlit module.
    k : int
        The number of columns to create.
    data : pd.DataFrame
        The DataFrame whose rows supply the card fields (title, authors, year, genres).
    """
    if len(data) < k and len(data) != 0:
        k = len(data)
    elif len(data) == 0:
        st.error("No items to display.")
        return

    num_data = len(data)
    num_columns = min(k, num_data)

    columns = st.columns(num_columns)
    for i in range(num_columns):
        item = data.iloc[i]

        columns[i].write(f'**{item["title"]}**')

        columns[i].write(f"**Автор:** {item['authors']}")
        columns[i].write(f"**Год:** {item['year']}")
        columns[i].write(f"**Жанры:** {item['genres']}")