I have written this Python web scraper script and deployed it on Heroku. It is a continuously running script that does web scraping after sleeping for a minute between runs. The problem is that it works fine for a few minutes and then stops doing anything. I think the dyno goes to sleep. How can I prevent that? How can I keep my script running without the dyno idling, or is some other problem happening here?
import time
from bs4 import BeautifulSoup
import urllib.request
import schedule
from bs4.element import Tag
# Base URL of the DTU page being scraped.
dtu_url="http://dtu.ac.in/"
prev_news = [] # Store the previous state of news to compare changes
# Sentinel: guarantees the first comparison in job() reports a change.
prev_updated = "Initial Value"
# Populated elsewhere — not written in this visible portion of the script.
updated_news = []
def compare_variables(prev, curr):
    """
    Report whether *prev* and *curr* differ.

    Prints a short status message and returns True when the two values
    are different, False when they are equal.
    """
    changed = prev != curr
    if changed:
        print("Change detected")
    else:
        print("No change values")
    return changed
def fetch_last_updated() -> str:
    '''
    Fetches the "Last updated" information from the DTU official webpage.

    Returns
    -------
    str
        The text that follows 'Last updated :' in the page's bottom section.
    '''
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection on every call).
    with urllib.request.urlopen(dtu_url) as page:
        soup = BeautifulSoup(page.read(), 'html.parser')
    bottom_section = soup.find('div', id='bottom_Section')
    # NOTE(review): raises AttributeError if the page layout changes and
    # the 'bottom_Section' div disappears — confirm that is acceptable.
    last_updated = bottom_section.get_text().split(
        'Last updated :')[1].split('\n')[0].strip()
    return last_updated
def fetch_latest_news() -> list:
    '''
    Fetches the latest news from the DTU official webpage.

    Returns
    -------
    list
        Dictionaries describing the latest news items, as produced by
        latest_tab_extractor().
    '''
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection on every call).
    with urllib.request.urlopen(dtu_url) as page:
        soup = BeautifulSoup(page.read(), 'html.parser')
    # The first 'latest_tab' div holds the news list we care about.
    latest_tab = soup.find_all('div', class_='latest_tab')
    latest_news = latest_tab_extractor(latest_tab[0])
    return latest_news
def latest_tab_extractor(html_component: Tag) -> list:
    '''
    Converts the HTML under the latest_tab div from the DTU webpage into a
    list of dictionaries: either {'name', 'link'} for plain items or
    {'name', 'sub_list'} for grouped items with several anchors.

    NOTE(review): the pasted source lost its indentation; the nesting below
    is the most plausible reconstruction — verify against the original file.
    '''
    result = []
    try:
        item: Tag
        for item in html_component.find_all('li'):
            try:
                a_tag = item.find_all('a')
                if len(a_tag) > 1:
                    # Several anchors: this <li> is a grouped entry.
                    sub = []
                    i: Tag
                    for i in a_tag:
                        try:
                            sub.append({
                                'name': str(i.get_text()).replace('||', '').strip(),
                                'link': convert_link(i.get('href'))
                            })
                        except Exception:
                            pass  # skip anchors that fail to parse
                    if item.a.get('href') is None:
                        # Grouped entry with a heading: name comes from the
                        # <h6> tag, or from the first anchor when the <h6>
                        # has no direct string.
                        if item.h6.string is None and a_tag[0].link is None:
                            name = a_tag[0].get_text()
                            sub.pop(0)  # first anchor was the heading itself
                        else:
                            name = item.h6.string
                        result.append({
                            'name': name.strip(),
                            'sub_list': sub
                        })
                    else:
                        # No heading: flatten the anchors into top-level items.
                        result.extend(sub)
                else:
                    result.append({
                        'name': item.get_text().strip(),
                        'link': convert_link(item.a.get('href'))
                    })
                # Stop once the trailing "view all" marker item is reached.
                if 'view all' in str(result[-1]):
                    result.pop()
                    break
            except Exception:
                pass  # ignore malformed <li> entries, keep the rest
    except Exception:
        # Total failure before anything was collected: return a placeholder.
        if len(result) == 0:
            result.append({
                'name': 'End of results',
                'link': 'http://dtu.ac.in/'
            })
    # Plain return instead of the original `finally: return result`, which
    # silently swallowed *every* exception including KeyboardInterrupt.
    return result
def convert_link(url: str, domain: str = dtu_url) -> str:
    '''
    Normalise a (possibly relative) href into an absolute URL on *domain*.

    Returns None when *url* is None; already-absolute URLs (containing
    'http' and '://') are returned unchanged.

    NOTE(review): the pasted code had its "/" string literals replaced by a
    stackoverflow.com link by the SO editor, which made the comparisons
    below always false — restored to plain "/".
    '''
    if url is None:
        return None
    # Root-relative ("/x") or dot-relative ("./x") paths: graft the domain on.
    if url[0] == "/" or url[0] == '.':
        return url.replace(
            "/", domain, 1) if url[0] == "/" else url.replace('./', domain)
    try:
        # Absolute URL check; str.index raises ValueError when absent.
        url.index('http')
        url.index('://')
        return url
    except ValueError:
        return domain + "/" + url
def job():
    '''
    Poll the DTU page once; refresh the news cache when the page changed.

    Called every minute by the schedule library.
    '''
    # BUG FIX: the assignment at the bottom made prev_updated a *local*
    # variable, so the read in compare_variables() raised UnboundLocalError
    # on every run — this, not dyno idling, silently killed the job.
    global prev_updated
    # Fetch last updated information from DTU Official Webpage
    curr_updated = fetch_last_updated()
    print("Runnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnning")
    # Compare the two variables
    if compare_variables(prev_updated, curr_updated):
        # Changes detected in last updated information
        print("Last Updated has changed to :", curr_updated)
        # Fetch latest news from DTU Official Webpage
        curr_news = fetch_latest_news()
    # Update the previous value for the next iteration
    prev_updated = curr_updated
# Register job() to run once a minute; run_pending() in the main loop fires it.
schedule.every(1).minutes.do(job)
if __name__ == "__main__":
    # Poll the scheduler once per second; schedule itself enforces the
    # one-minute interval registered above.
    while True:
        schedule.run_pending()
        time.sleep(1)
Maybe you could try to ping your app with an external service (e.g. Kaffeine) at regular intervals.
Alternatively, upgrade from the Eco dyno plan to Basic (or higher), whose dynos do not sleep after inactivity.