Redirection using Scrapy Spider Middleware (Unhandled error in Deferred)

I've made a spider using Scrapy that first solves a CAPTCHA at a redirected address before accessing the main website I intend to scrape. When I run it, I get an HTTP error about an infinite redirect loop, but I can't find which part of the script is causing it.
In the middleware:
import logging
import time
import urllib.request

from bs4 import BeautifulSoup
from PIL import Image
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware

logger = logging.getLogger(__name__)


class ProtectRedirectMiddleware(RedirectMiddleware):
    def __init__(self, settings):
        super().__init__(settings)
        self.source = urllib.request.urlopen('http://sampleurlname.com/')
        soup = BeautifulSoup(source, 'lxml')

    def _redirect(self, redirected, request, spider, reason):
        # act normally if this isn't a CAPTCHA redirect
        if not self.is_protected(redirected.url):
            return super()._redirect(redirected, request, spider, reason)
        # if this is a CAPTCHA redirect
        logger.debug(f'The protect URL is triggered for {request.url}')
        request.cookies = self.bypass_protection(redirected.url)
        request.dont_filter = True
        return request

    def is_protected(self, url):
        return 'sampleurlname.com/protect' in url

    def bypass_protection(self, url=None):
        # only navigate if an explicit url is provided
        if url:
            url = url or self.source.geturl(url)
            img = soup.find_all('img')[0]
            imgurl = img['src']
            urllib.request.urlretrieve(imgurl, "captcha.png")
            return self.solve_captcha(imgurl)
        # wait for the redirect and try again
        self.wait_for_redirect()
        return self.bypass_protection()

    def wait_for_redirect(self, url=None, wait=0.1, timeout=10):
        url = self.url
        for i in range(int(timeout // wait)):
            time.sleep(wait)
            if self.response.url() != url:
                return self.response.url()
        logger.error(f'Maybe {self.response.url()} isn\'t a redirect URL')
        raise Exception('Timed out')

    def solve_captcha(self, img, width=150, height=50):
        # open image
        self.img = 'captcha.png'
        img = Image.open("captcha.png")
        # image manipulation - simplified
        # input the captcha text - simplified
        # click the submit button - simplified
        # save the URL
        url = self.response.url()
        # try again if wrong
        if self.is_protected(self.wait_for_redirect(url)):
            return self.bypass_protection()
        # return the cookies as a dict
        cookies = {}
        for cookie_string in self.response.css.cookies():
            if 'domain=sampleurlname.com' in cookie_string:
                key, value = cookie_string.split(';')[0].split('=')
                cookies[key] = value
        return cookies
Then this is the error I get when I run scrapy crawl for my spider:
Unhandled error in Deferred:
2018-08-06 16:34:33 [twisted] CRITICAL: Unhandled error in Deferred:
2018-08-06 16:34:33 [twisted] CRITICAL:
Traceback (most recent call last):
File "/username/anaconda/lib/python3.6/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/crawler.py", line 80, in crawl
self.engine = self._create_engine()
File "/username/anaconda/lib/python3.6/site-packages/scrapy/crawler.py", line 105, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/username/anaconda/lib/python3.6/site-packages/scrapy/core/engine.py", line 69, in __init__
self.downloader = downloader_cls(crawler)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/core/downloader/__init__.py", line 88, in __init__
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/middleware.py", line 58, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/middleware.py", line 36, in from_settings
mw = mwcls.from_crawler(crawler)
File "/username/anaconda/lib/python3.6/site-packages/scrapy/downloadermiddlewares/redirect.py", line 26, in from_crawler
return cls(crawler.settings)
File "/username/...../scraper/myscraper/myscraper/middlewares.py", line 27, in __init__
self.source = urllib.request.urlopen('http://sampleurlname.com/')
File "/username/anaconda/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 564, in error
result = self._call_chain(*args)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 756, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/username/anaconda/lib/python3.6/urllib/request.py", line 532, in open
It basically repeats the bottom frames over and over (open, http_response, error, _call_chain, and http_error_302) until this shows at the end:
File "/username/anaconda/lib/python3.6/urllib/request.py", line 746, in http_error_302
self.inf_msg + msg, headers, fp)
urllib.error.HTTPError: HTTP Error 307: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Temporary Redirect
In settings.py I have:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'myscrape.middlewares.ProtectRedirectMiddleware': 600,
}

Your issue has nothing to do with Scrapy itself: you are doing a blocking request in your middleware's initialization.
That request gets stuck in a redirect loop. This usually happens when a website requires cookies before it lets you through:
First you connect and get a 30x redirect response along with some Set-Cookie headers.
Then you follow the redirect, but without sending those cookies back, so instead of letting you through the server just redirects you again.
Python's urllib doesn't handle cookies by default, so try something like this:
import logging
import urllib.error
import urllib.request
from http.cookiejar import CookieJar

from scrapy.selector import Selector

def __init__(self, settings):
    super().__init__(settings)
    try:
        url = 'http://sampleurlname.com/'
        req = urllib.request.Request(url)
        cj = CookieJar()
        # the cookie processor re-sends the cookies set during the redirect chain
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        response = opener.open(req)
        source = response.read().decode('utf8', errors='ignore')
        response.close()
    except urllib.error.HTTPError as e:
        logging.error(f"couldn't initiate middleware: {e}")
        return
    # you should use scrapy selectors instead of BeautifulSoup here
    # soup = BeautifulSoup(source, 'lxml')
    self.selector = Selector(text=source)
Alternatively, you could use the requests package, which handles cookies for you.
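For illustration, here is a minimal sketch of that requests-based alternative (the URL and the stored attributes are assumptions carried over from the snippets above, not code from the original post):

import logging
import requests
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
from scrapy.selector import Selector

class ProtectRedirectMiddleware(RedirectMiddleware):
    def __init__(self, settings):
        super().__init__(settings)
        try:
            # a Session stores and re-sends the cookies set by intermediate redirects
            session = requests.Session()
            response = session.get('http://sampleurlname.com/', timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            logging.error(f"couldn't initiate middleware: {e}")
            return
        # keep the cookies and the parsed page around for later use
        self.cookies = session.cookies.get_dict()
        self.selector = Selector(text=response.text)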

Related

Not able to trigger DAG from Airflow API but it's working from curl command

I am trying to trigger a DAG via the Airflow API from a Python script.
The DAG triggers fine from a curl command, but it's not working from the API call below.
import requests

url = 'http://localhost:8080/api/experimental/dags/document_validation/dag_runs'
myobj = {''}
x = requests.post(url, data=myobj, headers={"Content-Type": "application/json"})
print(x.text)
The error I am getting:
File "run_dag_api.py", line 6, in <module>
  `x = requests.post(url, data = myobj)`
File "/…/lib/python3.7/site-packages/requests/api.py", line 119, in post
  `return request('post', url, data=data, json=json, **kwargs)`
File "/…/lib/python3.7/site-packages/requests/api.py", line 61, in request
  `return session.request(method=method, url=url, **kwargs)`
File "/…/lib/python3.7/site-packages/requests/sessions.py", line 530, in request
  `resp = self.send(prep, **send_kwargs)`
File "/…/lib/python3.7/site-packages/requests/sessions.py", line 643, in send
  `r = adapter.send(request, **kwargs)`
File "/…/lib/python3.7/site-packages/requests/adapters.py", line 498, in send
  `raise ConnectionError(err, request=request)`
requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
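As a point of comparison, a curl call to this experimental endpoint usually maps to a requests call with a JSON body rather than a set literal; a minimal sketch under that assumption (the empty payload mirrors a curl -d '{}' form):

import requests

url = 'http://localhost:8080/api/experimental/dags/document_validation/dag_runs'

# json= serializes the payload and sets the Content-Type header automatically
response = requests.post(url, json={})
print(response.status_code, response.text)

A ConnectionError with "Remote end closed connection without response" usually means the server side dropped the connection, so it is also worth confirming the webserver is actually reachable at localhost:8080 from where the script runs.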

Flask form POST: cannot submit Chinese characters, only English words work

I cannot submit Chinese words in the form.
English words are OK.
How can I set UTF-8, in the HTML or in app.py?
Here is my page: http://shiqiu.pw/testpage
PS: I can submit Chinese words on my Mac with no problem, but when I deploy my code to my server, Chinese words cannot be submitted and it shows an Internal Server Error.
PPS: I send the posted words to MySQL and then show the MySQL data on a new page, so perhaps there is something I need to set in MySQL?
# myapp.py code part
with connection.cursor() as cursor:
    # Create a new record
    word = request.form.get('word')
    print('word')
    print(word)
    meaning = request.form.get('meaning')
    sql = 'INSERT INTO sqdict (word, meaning) VALUES (%s, %s)'
    cursor.execute(sql, (word, meaning))  # execute
    # connection is not autocommit by default, so you must commit to save your changes
    connection.commit()

with connection.cursor() as cursor:
    sql = 'SELECT * from sqdict'
    cursor.execute(sql)
    # sqlresult = cursor.fetchone()  # only show the first row
    sqlresult = cursor.fetchall()  # all rows
    print('sqlresult')
    print(sqlresult)
    allwords = sqlresult
ERROR LOG BELOW
word
工作
ERROR:flask.app:Exception on /test [POST]
Traceback (most recent call last):
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 1993, in make_response
rv = self.response_class.force_type(rv, request.environ)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/werkzeug/wrappers/base_response.py", line 269, in force_type
response = BaseResponse(*_run_wsgi_app(response, environ))
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/werkzeug/test.py", line 1119, in run_wsgi_app
app_rv = app(environ, start_response)
TypeError: 'InternalError' object is not callable
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 2311, in wsgi_app
response = self.full_dispatch_request()
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 1835, in full_dispatch_request
return self.finalize_request(rv)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 1850, in finalize_request
response = self.make_response(rv)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 2001, in make_response
reraise(TypeError, new_error, sys.exc_info()[2])
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/_compat.py", line 35, in reraise
raise value.with_traceback(tb)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/flask/app.py", line 1993, in make_response
rv = self.response_class.force_type(rv, request.environ)
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/werkzeug/wrappers/base_response.py", line 269, in force_type
response = BaseResponse(*_run_wsgi_app(response, environ))
File "/srv/data/web/vhosts/default/local/lib/python3.7/site-packages/werkzeug/test.py", line 1119, in run_wsgi_app
app_rv = app(environ, start_response)
TypeError: 'InternalError' object is not callable
The view function did not return a valid response. The return type must be a string, tuple, Response instance, or WSGI callable, but it was a InternalError.
The Chinese characters I entered show up as '工作'.
I don't know what the InternalError object is.
The full code is on GitHub; it may not be exactly the same, but there is no big difference.
I have tried request.form.getunicode('word').
I also tried app.config['JSON_AS_ASCII'] = False.
And I set <meta charset="UTF-8"> in the HTML code.
But none of these works.
I want the form to accept Chinese words as values.
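Since the failure only shows up once MySQL is involved, one thing worth checking (an assumption here, not something stated in the post) is that the database connection uses a UTF-8 charset, e.g. with PyMySQL:

import pymysql

# utf8mb4 covers all of Unicode, including Chinese characters
connection = pymysql.connect(
    host='localhost',
    user='dbuser',        # hypothetical credentials for illustration
    password='dbpassword',
    db='sqdictdb',        # hypothetical database name
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)

The table and column character sets would also need to be utf8mb4 for the INSERT of non-Latin text to succeed.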

Access Facebook Graph API in Python, Access Token usage

I'm new to Python and have been using different tutorials/guides to build a program that gets data from a Facebook page and writes it into an Azure SQL database. On the Facebook Developer page I've set up an app and created the App ID and App Secret.
Here's the relevant part of the code I think the issue is with:
# create authenticated post URL
def create_post_url(graph_url, APP_ID, APP_SECRET):
    post_args = "/posts/?key=value&access_token=" + APP_ID + "|" + APP_SECRET
    post_url = graph_url + post_args
    return post_url

# render graph url call to JSON
def render_to_json(graph_url):
    web_response = urllib2.urlopen(graph_url)
    readable_page = web_response.read()
    json_data = json.loads(readable_page)
    return json_data

def main():
    # define facebook app secret and app id
    APP_SECRET = "xyz"
    APP_ID = "abc"
    # define page username
    pagelist = ["porsche"]
    graph_url = "https://graph.facebook.com/"
In the Facebook Graph API Explorer, running a GET request works great and returns data (screenshot: Facebook Graph API Explorer results).
However, if I run my code in the CLI, the resulting error is as follows:
[vagrant#localhost SomeAnalysis]$ python src/collectors/facebook/facebookScaper.py
Traceback (most recent call last):
File "src/collectors/facebook/facebookScaper.py", line 92, in <module>
main()
File "src/collectors/facebook/facebookScaper.py", line 56, in main
json_fbpage = render_to_json(current_page)
File "src/collectors/facebook/facebookScaper.py", line 25, in render_to_json
web_response = urllib2.urlopen(graph_url)
File "/usr/lib64/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib64/python2.7/urllib2.py", line 437, in open
response = meth(req, response)
File "/usr/lib64/python2.7/urllib2.py", line 550, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib64/python2.7/urllib2.py", line 475, in error
return self._call_chain(*args)
File "/usr/lib64/python2.7/urllib2.py", line 409, in _call_chain
result = func(*args)
File "/usr/lib64/python2.7/urllib2.py", line 558, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 400: Bad Request
I've been checking SO and have found similar posts, but none explain how to use the access token. I've tried changing the URL construction to use the long ACCESS_TOKEN from the Graph API Explorer instead of APP_ID and APP_SECRET, but that yields the same 400 error.
Any help is appreciated.
/K
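For illustration, an app access token generally has the form APP_ID|APP_SECRET, and letting a library build and URL-encode the query string avoids malformed URLs; this is a sketch of that shape with placeholder credentials, not the poster's exact code:

import requests

APP_ID = "abc"       # placeholder
APP_SECRET = "xyz"   # placeholder
page = "porsche"

# app access tokens have the form "<app-id>|<app-secret>";
# passing it via params lets requests URL-encode the "|" character
params = {"access_token": "{}|{}".format(APP_ID, APP_SECRET)}
response = requests.get("https://graph.facebook.com/{}/posts".format(page), params=params)
response.raise_for_status()
print(response.json())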

Making Locust log in to a Web Application

I want Locust to be able to log in to my web application and start clicking the links inside the application.
With this code I just get activity for the front page with the login, and I don't get any activity from inside the application.
Code:
import random
from locust import HttpLocust, TaskSet, task
from pyquery import PyQuery

class WalkPages(TaskSet):
    def on_start(self):
        self.client.post("/", {
            "UserName": "my@email.com",
            "Password": "2Password!",
            "submit": "Sign In"
        })
        self.index_page()

    @task(10)
    def index_page(self):
        r = self.client.get("/Dashboard.mvc")
        pq = PyQuery(r.content)
        link_elements = pq("a")
        self.urls_on_current_page = []
        for l in link_elements:
            if "href" in l.attrib:
                self.urls_on_current_page.append(l.attrib["href"])

    @task(30)
    def load_page(self):
        url = random.choice(self.urls_on_current_page)
        r = self.client.get(url)

class AwesomeUser(HttpLocust):
    task_set = WalkPages
    host = "https://myenv.beta.webapp.com"
    min_wait = 20 * 1000
    max_wait = 60 * 1000
I get the following message in the terminal after the first round.
[2015-02-13 12:08:43,740] webapp-qa/ERROR/stderr: Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 267, in run
self.execute_next_task()
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 293, in execute_next_task
self.execute_task(task["callable"], *task["args"], **task["kwargs"])
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 305, in execute_task
task(self, *args, **kwargs)
File "/home/webapp/LoadTest/locustfile.py", line 31, in load_page
url = random.choice(self.urls_on_current_page)
File "/usr/lib/python2.7/random.py", line 273, in choice
return seq[int(self.random() * len(seq))] # raises IndexError if seq is empty
IndexError: list index out of range
[2015-02-13 12:08:43,752] webapp-qa/ERROR/stderr: Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 267, in run
self.execute_next_task()
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 293, in execute_next_task
self.execute_task(task["callable"], *task["args"], **task["kwargs"])
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 305, in execute_task
task(self, *args, **kwargs)
File "/home/webapp/LoadTest/locustfile.py", line 31, in load_page
url = random.choice(self.urls_on_current_page)
File "/usr/lib/python2.7/random.py", line 273, in choice
return seq[int(self.random() * len(seq))] # raises IndexError if seq is empty
IndexError: list index out of range
[2015-02-13 12:08:43,775] webapp-qa/ERROR/stderr: Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 267, in run
self.execute_next_task()
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 293, in execute_next_task
self.execute_task(task["callable"], *task["args"], **task["kwargs"])
File "/usr/local/lib/python2.7/dist-packages/locust/core.py", line 305, in execute_task
task(self, *args, **kwargs)
File "/home/webapp/LoadTest/locustfile.py", line 31, in load_page
url = random.choice(self.urls_on_current_page)
File "/usr/lib/python2.7/random.py", line 273, in choice
return seq[int(self.random() * len(seq))] # raises IndexError if seq is empty
IndexError: list index out of range
Your list may be empty.
@task(30)
def load_page(self):
    if self.urls_on_current_page:
        url = random.choice(self.urls_on_current_page)
        r = self.client.get(url)
This comes late, but someone may need it. My findings on your code: the login request does not look correct (check mine below); you cannot reach a variable defined inside one function from another function; and @task(10) is not suitable for a data-setter function. Set urls_on_current_page as a class variable so the other class members can use it. See my code and comments:
import random
from locust import HttpLocust, TaskSet, task
from pyquery import PyQuery

class WalkPages(TaskSet):
    # define the variable here to access it from inside the functions
    urls_on_current_page = []

    def login(self):
        self.client.post("/login", data={"UserName": "mesutgunes@email.com", "Password": "password"})

    def get_urls(self):
        r = self.client.get("/Dashboard.mvc")
        pq = PyQuery(r.content)
        link_elements = pq("a")
        for link in link_elements:
            # skip external links that may be on the page
            if "href" in link.attrib and "http" not in link.attrib["href"]:
                self.urls_on_current_page.append(link.attrib["href"])

    def on_start(self):
        self.login()
        self.get_urls()

    @task(30)
    def load_page(self):
        url = random.choice(self.urls_on_current_page)
        r = self.client.get(url)

class AwesomeUser(HttpLocust):
    task_set = WalkPages
    host = "https://myenv.beta.webapp.com"
    min_wait = 20 * 1000
    max_wait = 60 * 1000

Error with OAuth in the Google Data Protocol Client Libraries

When I was testing the code in "OAuth in the Google Data Protocol Client Libraries" (http://code.google.com/apis/gdata/docs/auth/oauth.html), I always got the following error. Can anyone give me a hint?
Error:
File "D:\PROJ\GAE\proj2\proj2.py", line 262, in get
return self.redirect(auth_url)
File "C:\DEV\google_appengine\v1.4.2\google\appengine\ext\webapp\__init__.py", line 380, in redirect
absolute_url = urlparse.urljoin(self.request.uri, uri)
File "C:\DEV\Python\v2.5.4\lib\urlparse.py", line 253, in urljoin
urlparse(url, bscheme, allow_fragments)
File "C:\DEV\Python\v2.5.4\lib\urlparse.py", line 154, in urlparse
tuple = urlsplit(url, scheme, allow_fragments)
File "C:\DEV\Python\v2.5.4\lib\urlparse.py", line 193, in urlsplit
i = url.find(':')
AttributeError: 'Uri' object has no attribute 'find'
Here is the code I use to fetch Google contacts info:
class Test(webapp.RequestHandler):
    def get(self):
        client = gdata.contacts.client.ContactsClient(source='www.mydomainname.com')
        callback_url = 'http://%s/test2' % self.request.host
        request_token = client.GetOAuthToken(['http://www.google.com/m8/feeds/'],
                                             callback_url,
                                             GOOGLE_KEY,
                                             GOOGLE_SECRET)
        gdata.gauth.AeSave(request_token, 'request_token')
        auth_url = request_token.generate_authorization_url(google_apps_domain=None)
        return self.redirect(auth_url)  # Error?!
Thanks in advance!
The auth_url generated is not a string.
Just do:
return self.redirect(str(auth_url))
and it will work.