Bài Báo Cáo Nhóm 1 Môn Trí Tuệ Nhân Tạo.docx

BỘ GIÁO DỤC VÀ ĐÀO TẠO TRƯỜNG ĐẠI HỌC BÁCH KHOA ĐÀ NẴNG SAU ĐẠI HỌC ~~~~~~*~~~~~~ BÀI BÁO CÁO MÔN HỌC TÊN BÀI DỰ ĐOÁN CHỦ ĐỀ TIN TỨC BẰNG MULTINOMIALNB Đà Nẵng – 2022 Học viên thực hiện NGUYỄN HOÀNG A[.]

Trang 1

BỘ GIÁO DỤC VÀ ĐÀO TẠO TRƯỜNG ĐẠI HỌC BÁCH KHOA ĐÀ NẴNG

SAU ĐẠI HỌC

~~~~~~*~~~~~~

BÀI BÁO CÁO MÔN HỌC

TÊN BÀI:

DỰ ĐOÁN CHỦ ĐỀ TIN TỨC BẰNG MULTINOMIALNB

Đà Nẵng – 2022

Học viên thực hiện : NGUYỄN HOÀNG ANH VŨ

LÊ HỒNG PHƯƠNG, VƯƠNG NHẬT QUANG

Giảng viên hướng dẫn : TS PHẠM MINH TUẤN

Trang 2

Mục lục

Nội dung 3

1 Tách từ bằng thư viện pyvi 3

2 Kiểm tra các từ đã tách thuộc tính từ, động từ hay danh từ 3

3 Đếm số lượng tính từ, động từ hay danh từ trong mỗi chủ đề 4

4 Tìm xem các từ sử dụng nhiều cho mỗi chủ đề 5

5 Dự đoán chủ đề tin tức bằng MultinomialNB 6

Trang 3

Nội dung

1 Tách từ bằng thư viện pyvi.

2 Kiểm tra các từ đã tách thuộc tính từ, động từ hay danh từ.

Crawl dữ liệu từ vnexpress

import json

import requests

import sqlite3

import traceback

from tqdm import tqdm

from bs4 import BeautifulSoup, PageElement

def get_meta_data(page, name: str):
    """Return the ``content`` attribute of the ``<meta name=...>`` tag in *page*.

    Args:
        page: Parsed document exposing a BeautifulSoup-style ``find`` method.
        name: Value of the ``name`` attribute of the ``<meta>`` tag to look up.

    Returns:
        The tag's ``content`` value, or ``None`` when the lookup fails
        (no matching tag, no ``content`` attribute, or *page* cannot be
        searched).
    """
    try:
        meta = page.find("meta", attrs={"name": name})
        return meta["content"]
    except (AttributeError, KeyError, TypeError):
        # AttributeError: page is None or has no find() method.
        # TypeError: find() returned None, which cannot be subscripted.
        # KeyError: the tag exists but carries no "content" attribute.
        return None

def parse_single_news(url: str):
    """Download one news article page and extract its title and body text.

    Args:
        url: Address of the article page. ``None`` is accepted and ignored.

    Returns:
        A dict ``{"link": url, "title": title, "content": content}`` where
        ``content`` is the description paragraph (if any) followed by every
        ``<p class="Normal">`` paragraph of the article body, joined with
        newlines. Returns ``None`` when *url* is ``None``, when the page has
        no recognisable article body, or when downloading/parsing fails.
    """
    if url is None:
        return None

    try:
        # A timeout keeps one stalled connection from blocking the whole crawl.
        response = requests.get(url, timeout=30)
        item = BeautifulSoup(response.text, "html.parser")

        # Parse title: two heading classes are tried in turn.
        title = item.find("h1", class_="title-detail")
        if title is None:
            title = item.find("h1", class_="title-post")
        title = title.text.strip() if title is not None else None

        # Parse description (optional lead paragraph).
        desc = item.find("p", class_="description")
        desc = [desc.text.strip()] if desc is not None else []

        # Parse content: the body is either an <article> or a <div>.
        body = item.find("article", class_="fck_detail")
        if body is None:
            body = item.find("div", class_="fck_detail")
        if body is None:
            # No article body on this page; nothing useful to return.
            return None

        paragraphs = body.find_all("p", class_="Normal")
        content = "\n".join(desc + [p.text.strip() for p in paragraphs])
        return {"link": url, "title": title, "content": content}
    except Exception:
        # Deliberately best-effort: a single broken page must not stop the
        # crawl, so any download/parse failure yields None for this URL.
        return None

def get_news_url(article: PageElement):
    """Return the link of the news item contained in *article*.

    Args:
        article: Parsed element for one entry of a category listing. It must
            expose a BeautifulSoup-style ``find`` method.

    Returns:
        The ``href`` of the first ``<a>`` inside the element with class
        ``title-news``, or ``None`` when the entry is an advertisement or
        the link cannot be found (the offending element is printed in that
        case).
    """
    # Parentheses make the two-line condition a single valid expression.
    is_ad = (
        article.find(class_="info-ads") is not None
        or article.find(class_="adsbyeclick") is not None
    )
    if is_ad:
        # Ignore ads
        return None

    try:
        title = article.find(class_="title-news")
        return title.find("a")["href"]
    except (AttributeError, KeyError, TypeError):
        # AttributeError: no "title-news" element (find() returned None).
        # TypeError: no <a> inside the title. KeyError: <a> without href.
        print(article)
        return None

def parse_category_news_urls(category_url: str):
    """Collect the article links listed on one category page.

    Args:
        category_url: Address of the category listing page to download.

    Returns:
        An iterable (a ``filter`` object) over the URLs returned by
        ``get_news_url`` for every ``<article class="item-news
        item-news-common">`` element on the page, with ``None`` results
        (ads, entries without a link) removed.
    """
    # A timeout keeps one stalled connection from blocking the whole crawl.
    response = requests.get(category_url, timeout=30)
    # Name the parser explicitly, matching parse_single_news, so the result
    # does not depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all("article", attrs={"class": "item-news item-news-common"})
    urls = filter(None, [get_news_url(item) for item in items])
    return urls

# Display names of the news sections to crawl.
categories = [
    "Thời sự", "Góc nhìn", "Thế giới", "Kinh doanh", "Khoa học",
    "Thể thao", "Pháp luật",
    "Giáo dục", "Sức khỏe", "Đời sống", "Du lịch", "Số hóa", "Ô tô, xe máy", "Ý kiến",
]

# Landing page of each section, listed in the same order as `categories`.
# This must be an ordered sequence: with a set literal the iteration order is
# arbitrary, so zip() would pair section names with the wrong URLs.
_category_links = [
    "https://vnexpress.net/thoi-su",
    "https://vnexpress.net/goc-nhin",
    "https://vnexpress.net/the-gioi",
    "https://vnexpress.net/kinh-doanh",
    "https://vnexpress.net/khoa-hoc",
    "https://vnexpress.net/the-thao",
    "https://vnexpress.net/phap-luat",
    "https://vnexpress.net/giao-duc",
    "https://vnexpress.net/suc-khoe",
    "https://vnexpress.net/doi-song",
    "https://vnexpress.net/du-lich",
    "https://vnexpress.net/so-hoa",
    "https://vnexpress.net/oto-xe-may",
    "https://vnexpress.net/y-kien",
]

# Mapping: section display name -> section URL.
category_urls = dict(zip(categories, _category_links))

3 Đếm số lượng tính từ, động từ hay danh từ trong mỗi chủ đề

4 Tìm xem các từ sử dụng nhiều cho mỗi chủ đề

Thống kê các từ được sử dụng trong mỗi chủ đề

import collections

import json

from tqdm import tqdm

from pyvi import ViTokenizer, ViPosTagger, ViUtils

def statistic(s: str, kind=("A", "V", "N")):
    """Count the part-of-speech tags and the words of the selected kinds in *s*.

    Args:
        s: Raw text. Newlines are replaced by spaces before tokenizing.
        kind: Part-of-speech tags to keep. The default selects "A", "V" and
            "N" (adjectives, verbs and nouns according to the report's
            section headings). Any container supporting ``in`` works.

    Returns:
        A tuple ``(parts_of_speech_counter, word_counter)``:
        a ``Counter`` of the kept tags, and a ``Counter`` of the lower-cased
        words carrying one of those tags.
    """
    s = s.replace("\n", " ")
    tokenized = ViTokenizer.tokenize(s)
    tagged = ViPosTagger.postagging(tokenized)
    # tagged[0] holds the tokens, tagged[1] the tag of each token.
    tokens, tags = tagged[0], tagged[1]

    parts_of_speech_counter = collections.Counter(
        tag for tag in tags if tag in kind
    )
    # Empty tokens are skipped, as the original filter(None, ...) did.
    word_counter = collections.Counter(
        word.lower() for word, tag in zip(tokens, tags) if tag in kind and word
    )
    return parts_of_speech_counter, word_counter

# Load the crawled articles. The file holds Vietnamese text, so the encoding
# is given explicitly instead of relying on the platform default.
with open("vnexpress.json", encoding="utf8") as f:
    data = json.load(f)

# Per-category statistics: category name -> {"parts_of_speech": ..., "words": ...}.
counters = {}
for category in data:
    category_name = category['category']
    parts_of_speech_counter = collections.Counter()
    words_counter = collections.Counter()

    # Skip empty entries in the article list.
    articles = filter(None, category["articles"])
    for article in tqdm(articles, desc="Articles"):
        content = article["content"]
        if content is not None:
            result = statistic(content)
            parts_of_speech_counter += result[0]
            words_counter += result[1]

    # Plain dicts ordered from most to least frequent, ready for JSON output.
    parts_of_speech_counter = dict(parts_of_speech_counter.most_common())
    words_counter = dict(words_counter.most_common())
    counters[category_name] = {"parts_of_speech": parts_of_speech_counter,
                               "words": words_counter}

# ensure_ascii=False keeps the Vietnamese words readable in the output file.
with open("thongke.json", "w", encoding='utf8') as f:
    json.dump(counters, f, ensure_ascii=False)

5 Dự đoán chủ đề tin tức bằng MultinomialNB

import json

import string

from joblib import dump, load

from pyvi import ViTokenizer, ViPosTagger

from sklearn.feature_extraction.text import TfidfVectorizer

Trang 7

from sklearn import preprocessing

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB

def tokenize(s: str) -> str:
    """Segment *s* with ``ViTokenizer.tokenize`` and return it lower-cased."""
    segmented = ViTokenizer.tokenize(s)
    return segmented.lower()

# Load the crawled articles.
with open("vnexpress.json", encoding="utf8") as f:
    data = json.load(f)

# X_Data: tokenized article bodies; Y_Data: category name of each body.
X_Data = []
Y_Data = []
for category in data:
    for article in category['articles']:
        # Skip missing entries and articles without a body.
        if article is not None and article['content'] is not None:
            X_Data.append(tokenize(article['content']))
            Y_Data.append(category['category'])

# One stop word per line; the newline and punctuation marks are added too.
with open("vietnamese-stopwords-dash.txt", encoding="utf8") as f:
    content = f.readlines()
stop_words = [line.rstrip() for line in content]
stop_words += ["\n"]
stop_words += list(string.punctuation)

# TF-IDF features over words, limited to 3000 features.
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words=stop_words,
                             max_features=3000)
X_train_tfidf = tfidf_vect.fit_transform(X_Data)

def train_model(classifier, X_data, y_data, model_path='phanloai.joblib'):
    """Fit *classifier* on a train split, report accuracy, and save the model.

    The data is split 90/10 into training and validation parts, stratified
    on the labels with a fixed ``random_state`` of 42.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``.
        X_data: Feature matrix for all samples.
        y_data: Label of each sample.
        model_path: File the fitted classifier is dumped to with joblib.
            Defaults to ``'phanloai.joblib'``.

    Returns:
        None. Training and validation accuracy are printed.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X_data, y_data, stratify=y_data, test_size=0.1, random_state=42
    )
    classifier.fit(X_train, y_train)

    train_predictions = classifier.predict(X_train)
    val_predictions = classifier.predict(X_val)

    # accuracy_score expects (y_true, y_pred); the value is the same either
    # way, but the documented order avoids surprises if the metric changes.
    print("Train accuracy: ", accuracy_score(y_train, train_predictions))
    print("Validation accuracy: ", accuracy_score(y_val, val_predictions))

    dump(classifier, model_path)

# Train a multinomial Naive Bayes classifier on the TF-IDF features;
# train_model also dumps the fitted model to 'phanloai.joblib'.
train_model(MultinomialNB(), X_train_tfidf, Y_Data)

# Texts to classify (the string looks like a placeholder to be replaced by
# a real article body -- confirm).
content = ["Nội dung tin tức"]

# Apply the same tokenization that was used on the training texts.
content = [tokenize(c) for c in content]

# Reuse the already-fitted vectorizer: transform only, no refitting.
content_tfidf = tfidf_vect.transform(content)

# Reload the classifier saved by train_model.
clf = load('phanloai.joblib')

# Print the predicted category of each input text.
print(clf.predict(content_tfidf))

1 Tài liệu tham khảo

o https://hyperledger-fabric.readthedocs.io/en/latest/install.html

o https://github.com/hyperledger/fabric-samples

o https://hyperledger-fabric.readthedocs.io/en/release-1.4/write_first_app.html

o https://www.youtube.com/watch?v=8tVx0r6pgU4

Tiêu đề	Dự Đoán Chủ Đề Tin Tức Bằng MultinomialNB
Tác giả	Nguyễn Hoàng Anh Vũ, Lê Hồng Phương, Vương Nhật Quang
Người hướng dẫn	TS. Phạm Minh Tuấn
Trường học	Đại Học Bách Khoa Đà Nẵng
Chuyên ngành	Trí Tuệ Nhân Tạo
Thể loại	Bài Báo Cáo Môn Học
Năm xuất bản	2022
Thành phố	Đà Nẵng

Định dạng
Số trang	8
Dung lượng	116,08 KB