In the previous posts [1], [2] we created a few scripts for extracting web data. Combining these scripts, we will now build a web crawling script with text mining functionality, namely Latent Dirichlet Allocation (LDA).
In LDA, each document may be viewed as a mixture of various topics, where each document is considered to have a set of topics that are assigned to it via LDA. Thus each document is assumed to be characterized by a particular set of topics. This is akin to the standard bag-of-words model assumption, and makes the individual words exchangeable. [3]
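As a quick illustration of this idea, here is a minimal sketch (the toy documents below are made up for illustration and are not part of the crawling script): gensim can train an LDA model on a few tokenized documents and then report the topic mixture of each document.

from gensim import corpora, models

# toy corpus: each "document" is already tokenized
docs = [["data", "mining", "text", "topics"],
        ["web", "crawler", "links", "pages"],
        ["text", "topics", "words", "model"]]

dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)

# each document comes back as a mixture of the 2 topics, e.g. [(0, 0.87), (1, 0.13)]
for i, bow in enumerate(corpus):
    print("document", i, "->", lda.get_document_topics(bow))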
Our web crawling script consists of the following parts:
1. Extracting links. The input file with the pages to use is opened, each page is visited, and the links on that page are extracted using urllib.request and lxml. The extracted links are saved to a csv file.
2. Downloading text content. The file with the extracted links is opened, each link is visited, and data (such as the useful content without navigation and advertisements, the html, and the title) are extracted using the newspaper python module. This happens inside the function extract(url). Additionally, the text content extracted from each link is saved to an in-memory list for the LDA analysis in the next step.
3. Analyzing the text with LDA. Here the script prepares the text data, runs the actual LDA, and outputs some results. The topic, term and probability are also saved to a file.
Below is the figure with the script flow, followed by the full python source code.
# -*- coding: utf-8 -*-
from newspaper import Article, Config
import os
import csv
import time
import urllib.request
import lxml.html
import re
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora
import gensim
regex = re.compile(r'\d\d\d\d')
path="C:\\Users\\Owner\\Python_2016"
#urlsA.csv file has the links for extracting web pages to visit
filename = path + "\\" + "urlsA.csv"
filename_urls_extracted= path + "\\" + "urls_extracted.csv"
# read the rows of a csv file into a list
def load_file(fn):
    start = 0
    file_urls = []
    with open(fn, encoding="utf8") as f:
        csv_f = csv.reader(f)
        for i, row in enumerate(csv_f):
            if i >= start:
                file_urls.append(row)
    return file_urls
# append one extracted link to a csv file (write the header on first use)
def save_extracted_url(fn, row):
    if os.path.isfile(fn):
        m = "a"
    else:
        m = "w"
    with open(fn, m, encoding="utf8", newline='') as csvfile:
        fieldnames = ['url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if m == "w":
            writer.writeheader()
        writer.writerow(row)
urlsA = load_file(filename)
print("Starting to navigate...")
for u in urlsA:
    print(u[0])
    req = urllib.request.Request(u[0], headers={'User-Agent': 'Mozilla/5.0'})
    connection = urllib.request.urlopen(req)
    print("connected")
    dom = lxml.html.fromstring(connection.read())
    time.sleep(7)
    links = []
    for link in dom.xpath('//a/@href'):
        try:
            links.append(link)
        except:
            print("EXCP" + link)
    # keep only the links that contain four consecutive digits
    selected_links = list(filter(regex.search, links))
    link_data = {}
    for link in selected_links:
        link_data['url'] = link
        save_extracted_url(filename_urls_extracted, link_data)
#urls.csv file has the links for extracting content
filename = path + "\\" + "urls.csv"
#data_from_urls.csv is file where extracted data is saved
filename_out= path + "\\" + "data_from_urls.csv"
#below is the file where visited urls are saved
filename_urls_visited = path + "\\" + "visited_urls.csv"
#load urls from file to memory
urls= load_file (filename)
visited_urls=load_file (filename_urls_visited)
# append one row of extracted article data to a csv file (write the header on first use)
def save_to_file(fn, row):
    if os.path.isfile(fn):
        m = "a"
    else:
        m = "w"
    with open(fn, m, encoding="utf8", newline='') as csvfile:
        fieldnames = ['url', 'authors', 'title', 'text', 'summary', 'keywords', 'publish_date', 'image', 'N']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if m == "w":
            writer.writeheader()
        writer.writerow(row)

# append one visited url to a csv file (write the header on first use)
def save_visited_url(fn, row):
    if os.path.isfile(fn):
        m = "a"
    else:
        m = "w"
    with open(fn, m, encoding="utf8", newline='') as csvfile:
        fieldnames = ['url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if m == "w":
            writer.writeheader()
        writer.writerow(row)
# to save html to a file we need to know the number of the previously saved file
def get_last_number():
    path = "C:\\Users\\Owner\\Desktop\\A\\Python_2016_A"
    count = 0
    for f in os.listdir(path):
        if f[-5:] == ".html":
            count = count + 1
    return count
config = Config()
config.keep_article_html = True
# download and parse one article with newspaper and return the extracted fields
def extract(url):
    article = Article(url=url, config=config)
    article.download()
    time.sleep(7)
    article.parse()
    article.nlp()
    return dict(
        title=article.title,
        text=article.text,
        html=article.html,
        image=article.top_image,
        authors=article.authors,
        publish_date=article.publish_date,
        keywords=article.keywords,
        summary=article.summary,
    )
doc_set = []
for url in urls:
    newsp = extract(url[0])
    newsp['url'] = url[0]
    next_number = get_last_number()
    next_number = next_number + 1
    newsp['N'] = str(next_number) + ".html"
    with open(str(next_number) + ".html", "w", encoding='utf-8') as f:
        f.write(newsp['html'])
    print("HTML is saved to " + str(next_number) + ".html")
    del newsp['html']
    u = {}
    u['url'] = url[0]
    # keep the extracted text in memory for the LDA step
    doc_set.append(newsp['text'])
    save_to_file(filename_out, newsp)
    save_visited_url(filename_urls_visited, u)
    time.sleep(4)
tokenizer = RegexpTokenizer(r'\w+')
en_stop = get_stop_words('en')
p_stemmer = PorterStemmer()
texts = []
# loop through all documents: lowercase, tokenize, remove stop words, stem
for i in doc_set:
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if i not in en_stop]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    texts.append(stemmed_tokens)
num_topics = 2
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print (corpus)
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=20)
print (ldamodel)
print(ldamodel.print_topics(num_topics=3, num_words=3))
#print topics containing term "ai"
print (ldamodel.get_term_topics("ai", minimum_probability=None))
print (ldamodel.get_document_topics(corpus[0]))
# Get Per-topic word probability matrix:
K = ldamodel.num_topics
topicWordProbMat = ldamodel.print_topics(K)
print (topicWordProbMat)
fn = "topic_terms5.csv"
if os.path.isfile(fn):
    m = "a"
else:
    m = "w"
# save topic, term, prob data in the file
with open(fn, m, encoding="utf8", newline='') as csvfile:
    fieldnames = ["topic_id", "term", "prob"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if m == "w":
        writer.writeheader()
    for topic_id in range(num_topics):
        term_probs = ldamodel.show_topic(topic_id, topn=6)
        for term, prob in term_probs:
            row = {}
            row['topic_id'] = topic_id
            row['prob'] = prob
            row['term'] = term
            writer.writerow(row)
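Once the model is trained, it can also be applied to a new, previously unseen document. Below is a minimal sketch (the example text is made up for illustration): the new text goes through the same preprocessing chain as above (lowercase, tokenize, remove stop words, stem) and is converted with the same dictionary before the model is asked for its topic mixture.

# score a new document with the trained model (the example text is made up)
new_doc = "Artificial intelligence and machine learning news"
new_tokens = [p_stemmer.stem(t) for t in tokenizer.tokenize(new_doc.lower()) if t not in en_stop]
new_bow = dictionary.doc2bow(new_tokens)
print(ldamodel.get_document_topics(new_bow))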
References
1. Extracting Links from Web Pages Using Different Python Modules
2. Web Content Extraction is Now Easier than Ever Using Python Scripting
3. Latent Dirichlet allocation, Wikipedia
4. Latent Dirichlet Allocation
5. Using Keyword Generation to refine Topic Models
6. Beginners Guide to Topic Modeling in Python