In the previous posts [1], [2] we created a few scripts for extracting web data. Combining these scripts, we will now build a web crawling script with text mining functionality, namely Latent Dirichlet Allocation (LDA).
In LDA, each document may be viewed as a mixture of various topics, where each document is considered to have a set of topics that are assigned to it via LDA. Thus each document is assumed to be characterized by a particular set of topics. This is akin to the standard bag-of-words model assumption, and makes the individual words exchangeable. [3]
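As a quick illustration of this idea, here is a minimal sketch (the toy documents below are made up for illustration and are not part of the crawling script): gensim can train an LDA model on a few tokenized documents and then report the topic mixture of each document.

from gensim import corpora, models

# toy corpus: each "document" is already tokenized
docs = [["data", "mining", "text", "topics"],
        ["web", "crawler", "links", "pages"],
        ["text", "topics", "words", "model"]]

dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)

# each document comes back as a mixture of the 2 topics, e.g. [(0, 0.87), (1, 0.13)]
for i, bow in enumerate(corpus):
    print("document", i, "->", lda.get_document_topics(bow))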
Our web crawling script consists of the following parts:
1. Extracting links. The input file with the pages to use is opened, each page is visited, and the links on that page are extracted using urllib.request and lxml. The extracted links are saved to a csv file.
2. Downloading text content. The file with the extracted links is opened, each link is visited, and data (such as the useful content without navigation and advertisements, the html, and the title) are extracted using the newspaper python module. This happens inside the function extract(url). Additionally, the text content extracted from each link is saved to an in-memory list for the LDA analysis in the next step.
3. Analyzing the text with LDA. Here the script prepares the text data, runs the actual LDA, and outputs some results. The topic, term and probability are also saved to a file.
Below is the figure with the script flow, followed by the full python source code.
# -*- coding: utf-8 -*-
from newspaper import Article, Config
import os
import csv
import time
import urllib.request
import lxml.html
import re
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora
import gensim
regex = re.compile(r'\d\d\d\d')
path="C:\\Users\\Owner\\Python_2016"
#urlsA.csv file has the links for extracting web pages to visit
filename = path + "\\" + "urlsA.csv"
filename_urls_extracted= path + "\\" + "urls_extracted.csv"
# read the rows of a csv file into a list
def load_file(fn):
    start = 0
    file_urls = []
    with open(fn, encoding="utf8") as f:
        csv_f = csv.reader(f)
        for i, row in enumerate(csv_f):
            if i >= start:
                file_urls.append(row)
    return file_urls
# append one extracted link to a csv file (write the header on first use)
def save_extracted_url(fn, row):
    if os.path.isfile(fn):
        m = "a"
    else:
        m = "w"
    with open(fn, m, encoding="utf8", newline='') as csvfile:
        fieldnames = ['url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if m == "w":
            writer.writeheader()
        writer.writerow(row)
urlsA = load_file(filename)
print("Starting to navigate...")
for u in urlsA:
    print(u[0])
    req = urllib.request.Request(u[0], headers={'User-Agent': 'Mozilla/5.0'})
    connection = urllib.request.urlopen(req)
    print("connected")
    dom = lxml.html.fromstring(connection.read())
    time.sleep(7)
    links = []
    for link in dom.xpath('//a/@href'):
        try:
            links.append(link)
        except:
            print("EXCP" + link)
    # keep only the links that contain four consecutive digits
    selected_links = list(filter(regex.search, links))
    link_data = {}
    for link in selected_links:
        link_data['url'] = link
        save_extracted_url(filename_urls_extracted, link_data)
#urls.csv file has the links for extracting content
filename = path + "\\" + "urls.csv"
#data_from_urls.csv is file where extracted data is saved
filename_out= path + "\\" + "data_from_urls.csv"
#below is the file where visited urls are saved
filename_urls_visited = path + "\\" + "visited_urls.csv"
#load urls from file to memory
urls= load_file (filename)
visited_urls=load_file (filename_urls_visited)
# append one row of extracted article data to a csv file (write the header on first use)
def save_to_file(fn, row):
    if os.path.isfile(fn):
        m = "a"
    else:
        m = "w"
    with open(fn, m, encoding="utf8", newline='') as csvfile:
        fieldnames = ['url', 'authors', 'title', 'text', 'summary', 'keywords', 'publish_date', 'image', 'N']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if m == "w":
            writer.writeheader()
        writer.writerow(row)

# append one visited url to a csv file (write the header on first use)
def save_visited_url(fn, row):
    if os.path.isfile(fn):
        m = "a"
    else:
        m = "w"
    with open(fn, m, encoding="utf8", newline='') as csvfile:
        fieldnames = ['url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if m == "w":
            writer.writeheader()
        writer.writerow(row)
# to save html to a file we need to know the number of the previously saved file
def get_last_number():
    path = "C:\\Users\\Owner\\Desktop\\A\\Python_2016_A"
    count = 0
    for f in os.listdir(path):
        if f[-5:] == ".html":
            count = count + 1
    return count
config = Config()
config.keep_article_html = True
# download and parse one article with newspaper and return the extracted fields
def extract(url):
    article = Article(url=url, config=config)
    article.download()
    time.sleep(7)
    article.parse()
    article.nlp()
    return dict(
        title=article.title,
        text=article.text,
        html=article.html,
        image=article.top_image,
        authors=article.authors,
        publish_date=article.publish_date,
        keywords=article.keywords,
        summary=article.summary,
    )
doc_set = []
for url in urls:
    newsp = extract(url[0])
    newsp['url'] = url[0]
    next_number = get_last_number()
    next_number = next_number + 1
    newsp['N'] = str(next_number) + ".html"
    with open(str(next_number) + ".html", "w", encoding='utf-8') as f:
        f.write(newsp['html'])
    print("HTML is saved to " + str(next_number) + ".html")
    del newsp['html']
    u = {}
    u['url'] = url[0]
    # keep the extracted text in memory for the LDA step
    doc_set.append(newsp['text'])
    save_to_file(filename_out, newsp)
    save_visited_url(filename_urls_visited, u)
    time.sleep(4)
tokenizer = RegexpTokenizer(r'\w+')
en_stop = get_stop_words('en')
p_stemmer = PorterStemmer()
texts = []
# loop through all documents: lowercase, tokenize, remove stop words, stem
for i in doc_set:
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if i not in en_stop]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    texts.append(stemmed_tokens)
num_topics = 2
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print (corpus)
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=20)
print (ldamodel)
print(ldamodel.print_topics(num_topics=3, num_words=3))
#print topics containing term "ai"
print (ldamodel.get_term_topics("ai", minimum_probability=None))
print (ldamodel.get_document_topics(corpus[0]))
# Get Per-topic word probability matrix:
K = ldamodel.num_topics
topicWordProbMat = ldamodel.print_topics(K)
print (topicWordProbMat)
fn = "topic_terms5.csv"
if os.path.isfile(fn):
    m = "a"
else:
    m = "w"
# save topic, term, prob data in the file
with open(fn, m, encoding="utf8", newline='') as csvfile:
    fieldnames = ["topic_id", "term", "prob"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if m == "w":
        writer.writeheader()
    for topic_id in range(num_topics):
        term_probs = ldamodel.show_topic(topic_id, topn=6)
        for term, prob in term_probs:
            row = {}
            row['topic_id'] = topic_id
            row['prob'] = prob
            row['term'] = term
            writer.writerow(row)
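Once the model is trained, it can also be applied to a new, previously unseen document. Below is a minimal sketch (the example text is made up for illustration): the new text goes through the same preprocessing chain as above (lowercase, tokenize, remove stop words, stem) and is converted with the same dictionary before the model is asked for its topic mixture.

# score a new document with the trained model (the example text is made up)
new_doc = "Artificial intelligence and machine learning news"
new_tokens = [p_stemmer.stem(t) for t in tokenizer.tokenize(new_doc.lower()) if t not in en_stop]
new_bow = dictionary.doc2bow(new_tokens)
print(ldamodel.get_document_topics(new_bow))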
References
1. Extracting Links from Web Pages Using Different Python Modules
2. Web Content Extraction is Now Easier than Ever Using Python Scripting
3. Latent Dirichlet allocation, Wikipedia
4. Latent Dirichlet Allocation
5. Using Keyword Generation to refine Topic Models
6. Beginners Guide to Topic Modeling in Python