I have written this Python web scraper script and deployed it on Heroku. It is a continuously running script that does web scraping after sleeping for a minute between runs. The problem is that it works fine for a few minutes and then stops doing anything. I think the dyno goes to sleep. How can I prevent that? How can I keep my script running without the dyno idling, or is some other problem happening here?
import time
from bs4 import BeautifulSoup
import urllib.request
import schedule
from bs4.element import Tag
# Base URL of the DTU page being scraped.
dtu_url="http://dtu.ac.in/"
prev_news = [] # Store the previous state of news to compare changes
# Sentinel: guarantees the first comparison in job() reports a change.
prev_updated = "Initial Value"
# Populated elsewhere — not written in this visible portion of the script.
updated_news = []
def compare_variables(prev, curr):
    """
    Report whether *prev* and *curr* differ.

    Prints a short status message and returns True when the two values
    are different, False when they are equal.
    """
    changed = prev != curr
    if changed:
        print("Change detected")
    else:
        print("No change values")
    return changed
def fetch_last_updated() -> str:
    '''
    Fetches the "Last updated" information from the DTU official webpage.

    Returns
    -------
    str
        The text that follows 'Last updated :' in the page's bottom section.
    '''
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection on every call).
    with urllib.request.urlopen(dtu_url) as page:
        soup = BeautifulSoup(page.read(), 'html.parser')
    bottom_section = soup.find('div', id='bottom_Section')
    # NOTE(review): raises AttributeError if the page layout changes and
    # the 'bottom_Section' div disappears — confirm that is acceptable.
    last_updated = bottom_section.get_text().split(
        'Last updated :')[1].split('\n')[0].strip()
    return last_updated
def fetch_latest_news() -> list:
    '''
    Fetches the latest news from the DTU official webpage.

    Returns
    -------
    list
        Dictionaries describing the latest news items, as produced by
        latest_tab_extractor().
    '''
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection on every call).
    with urllib.request.urlopen(dtu_url) as page:
        soup = BeautifulSoup(page.read(), 'html.parser')
    # The first 'latest_tab' div holds the news list we care about.
    latest_tab = soup.find_all('div', class_='latest_tab')
    latest_news = latest_tab_extractor(latest_tab[0])
    return latest_news
def latest_tab_extractor(html_component: Tag) -> list:
    '''
    Converts the HTML under the latest_tab div from the DTU webpage into a
    list of dictionaries: either {'name', 'link'} for plain items or
    {'name', 'sub_list'} for grouped items with several anchors.

    NOTE(review): the pasted source lost its indentation; the nesting below
    is the most plausible reconstruction — verify against the original file.
    '''
    result = []
    try:
        item: Tag
        for item in html_component.find_all('li'):
            try:
                a_tag = item.find_all('a')
                if len(a_tag) > 1:
                    # Several anchors: this <li> is a grouped entry.
                    sub = []
                    i: Tag
                    for i in a_tag:
                        try:
                            sub.append({
                                'name': str(i.get_text()).replace('||', '').strip(),
                                'link': convert_link(i.get('href'))
                            })
                        except Exception:
                            pass  # skip anchors that fail to parse
                    if item.a.get('href') is None:
                        # Grouped entry with a heading: name comes from the
                        # <h6> tag, or from the first anchor when the <h6>
                        # has no direct string.
                        if item.h6.string is None and a_tag[0].link is None:
                            name = a_tag[0].get_text()
                            sub.pop(0)  # first anchor was the heading itself
                        else:
                            name = item.h6.string
                        result.append({
                            'name': name.strip(),
                            'sub_list': sub
                        })
                    else:
                        # No heading: flatten the anchors into top-level items.
                        result.extend(sub)
                else:
                    result.append({
                        'name': item.get_text().strip(),
                        'link': convert_link(item.a.get('href'))
                    })
                # Stop once the trailing "view all" marker item is reached.
                if 'view all' in str(result[-1]):
                    result.pop()
                    break
            except Exception:
                pass  # ignore malformed <li> entries, keep the rest
    except Exception:
        # Total failure before anything was collected: return a placeholder.
        if len(result) == 0:
            result.append({
                'name': 'End of results',
                'link': 'http://dtu.ac.in/'
            })
    # Plain return instead of the original `finally: return result`, which
    # silently swallowed *every* exception including KeyboardInterrupt.
    return result
def convert_link(url: str, domain: str = dtu_url) -> str:
    '''
    Normalise a (possibly relative) href into an absolute URL on *domain*.

    Returns None when *url* is None; already-absolute URLs (containing
    'http' and '://') are returned unchanged.

    NOTE(review): the pasted code had its "/" string literals replaced by a
    stackoverflow.com link by the SO editor, which made the comparisons
    below always false — restored to plain "/".
    '''
    if url is None:
        return None
    # Root-relative ("/x") or dot-relative ("./x") paths: graft the domain on.
    if url[0] == "/" or url[0] == '.':
        return url.replace(
            "/", domain, 1) if url[0] == "/" else url.replace('./', domain)
    try:
        # Absolute URL check; str.index raises ValueError when absent.
        url.index('http')
        url.index('://')
        return url
    except ValueError:
        return domain + "/" + url
def job():
    '''
    Poll the DTU page once; refresh the news cache when the page changed.

    Called every minute by the schedule library.
    '''
    # BUG FIX: the assignment at the bottom made prev_updated a *local*
    # variable, so the read in compare_variables() raised UnboundLocalError
    # on every run — this, not dyno idling, silently killed the job.
    global prev_updated
    # Fetch last updated information from DTU Official Webpage
    curr_updated = fetch_last_updated()
    print("Runnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnning")
    # Compare the two variables
    if compare_variables(prev_updated, curr_updated):
        # Changes detected in last updated information
        print("Last Updated has changed to :", curr_updated)
        # Fetch latest news from DTU Official Webpage
        curr_news = fetch_latest_news()
    # Update the previous value for the next iteration
    prev_updated = curr_updated
# Register job() to run once a minute; run_pending() in the main loop fires it.
schedule.every(1).minutes.do(job)
if __name__ == "__main__":
    # Poll the scheduler once per second; schedule itself enforces the
    # one-minute interval registered above.
    while True:
        schedule.run_pending()
        time.sleep(1)
Maybe you could try to ping your app with an external service (e.g. Kaffeine) at regular intervals.
Alternatively, upgrade from the Eco dyno plan to Basic (or higher), whose dynos do not sleep after inactivity.