I would like to fill out a form using FormRequest and Scrapy. However, I'm stuck and I don't know how to proceed to submit the form. I would be very grateful if someone could help me. This is the code I'm using:
import scrapy
def authentication_failed(response):
    """Report whether the form submission behind ``response`` was rejected.

    This is still a placeholder: no check is implemented, so the function
    returns ``None`` (falsy) and callers that test the result for truthiness
    treat every response as a success.

    Args:
        response: The response received after submitting the form.

    Returns:
        ``None`` for now; the intended contract is ``True`` on failure and
        ``False`` on success.
    """
    # TODO: Check the contents of the response and return True if it failed
    # or False if it succeeded.
    return None
class IdealistaSpider(scrapy.Spider):
    """Spider that loads one idealista listing and submits its contact form."""

    name = "MiPrimerSpider"
    # Send a desktop-browser User-Agent for every request made by this spider.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    start_urls = ['https://www.idealista.com/inmueble/94342193/']

    def parse(self, response):
        """Build the contact-form submission from the listing page.

        The form is located with ``formxpath``; ``formdata`` overrides the
        listed fields, and ``clickdata`` selects the submit control by its
        ``class`` attribute.

        Returns:
            A ``scrapy.FormRequest`` whose response is handled by
            ``after_login``.
        """
        return scrapy.FormRequest.from_response(
            response,
            # XPath attribute tests are written with '@', e.g. [@class='...'].
            formxpath="//div[@class='modulecontact_form']/form[@class='formcontact']",
            formdata={
                'contact-email': 'an email',
                'contact-phone': 'a number phone',
                'contact-name': 'costaman',
                'privacy-policy-checkbox': ' ',
                'recommendations-checkbox': None,
            },
            clickdata={'class': 'btn action txt-bold txt-big desktop'},
            callback=self.after_login,
        )

    def after_login(self, response):
        """Log an error when the submission is reported as failed."""
        if authentication_failed(response):
            self.logger.error("Login failed")
            return
Related
I am trying to log in to a website using a username and password. When I open an HttpURLConnection and POST to it, the status code is 200 but it doesn't actually log in. When I checked the login process with the Chrome DevTools console, I found that after clicking the login button, the parameters are sent to the address I used and it returns 302 as the status code. Even after I added this line to my code, the result didn't change.
connection2.setInstanceFollowRedirects(true);
here is my code.
// Login attempt: POST a form-encoded payload to the login page with previously
// collected cookies attached, then print the resulting status code.
String loginPageURL = "https://AAAAAAAAAA";
// Copy the cookies gathered earlier (`cookies` is defined outside this snippet)
// into a fresh CookieManager that accepts everything.
CookieManager cookieManager = new CookieManager();
cookieManager.setCookiePolicy(CookiePolicy.ACCEPT_ALL);
cookies.forEach(cookie -> cookieManager.getCookieStore().add(null, cookie));
URL url2 = new URL(loginPageURL);
HttpURLConnection connection2 = (HttpURLConnection) url2.openConnection();
// The cookies are sent once, as a hand-built "Cookie" header.
// NOTE(review): this snippet never installs the manager with
// CookieHandler.setDefault(...), so cookies set by intermediate redirect
// responses are presumably not captured or replayed -- confirm.
connection2.setRequestProperty("Cookie",
StringUtils.join(cookieManager.getCookieStore().getCookies(), ";"));
connection2.setInstanceFollowRedirects(true);
String loginPayload ="mypayload";
connection2.setRequestMethod("POST");
// Required so that getOutputStream() can be used to send the request body.
connection2.setDoOutput(true);
// Browser-like request headers copied from the DevTools capture.
connection2.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
connection2.setRequestProperty("Accept-Encoding", "deflate, br");
connection2.setRequestProperty("Accept-Language", "en-US,en;q=0.9,fa;q=0.8");
connection2.setRequestProperty("Cache-Control", "max-age=0");
connection2.setRequestProperty("Connection", "keep-alive");
// NOTE(review): length() counts chars, not encoded bytes; the two only agree
// for single-byte payloads -- confirm the payload is plain ASCII.
connection2.setRequestProperty("Content-Length", String.valueOf(loginPayload.length()));
connection2.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
// NOTE(review): a Host header normally carries only host[:port], without a
// scheme; HttpURLConnection may also ignore restricted headers such as Host,
// Origin and Content-Length -- verify what is actually sent on the wire.
connection2.setRequestProperty("Host", "https://BBBBBBBBBB");
connection2.setRequestProperty("Origin", "https://BBBBBBBBBB");
connection2.setRequestProperty("Referer", "https://AAAAAAAAAA");
connection2.setRequestProperty("sec-ch-ua", " Not A;Brand;v=99, Chromium;v=100, Google Chrome;v=100");
connection2.setRequestProperty("sec-ch-ua-mobile", "?0");
connection2.setRequestProperty("sec-ch-ua-platform", "Windows");
connection2.setRequestProperty("Sec-Fetch-Dest", "document");
connection2.setRequestProperty("Sec-Fetch-Mode", "navigate");
connection2.setRequestProperty("Sec-Fetch-Site", "same-origin");
connection2.setRequestProperty("Sec-Fetch-User", "?1");
connection2.setRequestProperty("Upgrade-Insecure-Requests", "1");
connection2.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36");
// Write the request body.
DataOutputStream out = new DataOutputStream(connection2.getOutputStream());
out.writeBytes(loginPayload);
// The status code is read here, before `out` is closed below.
System.out.println("login connection status code: "+connection2.getResponseCode());
System.out.println("content length "+loginPayload.length());
out.close();
System.out.println("*************************************************************");
int status = connection2.getResponseCode();
// The Location header is only inspected when the final status is 200.
// NOTE(review): Location normally accompanies 3xx responses, and redirects are
// being followed automatically (see setInstanceFollowRedirects above), so this
// is likely to print null -- confirm.
if (status == HttpURLConnection.HTTP_OK) {
String header = connection2.getHeaderField("Location");
System.out.println(header);
}
Can anybody help me figure out where the problem is?
Thanks in advance.
How can I post form data to the URL http://washington.kdmid.ru/queue-en/Visitor.aspx?
When I submit the form data with the fields below, I get the same page in the response instead of being redirected to the next page.
import requests
from bs4 import BeautifulSoup
# Fetch the visitor form, copy its existing <input> values, fill in the
# applicant fields and post everything back to the same URL.
location_url = "http://washington.kdmid.ru/queue-en/visitor.aspx"
s = requests.Session()
main_page = s.get(location_url)
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
main_html = BeautifulSoup(main_page.text, "html.parser")
c_form = main_html.find_all("form")[0]
c_form_submit = c_form.attrs["action"]
# Seed the payload with every <input> already present in the form.
data = {e.attrs.get("name"): e.attrs.get("value") for e in c_form.find_all("input")}
data["ctl00$MainContent$txtFam"] = "bsssabassra"
data["ctl00$MainContent$txtIm"] = "Akssssshassya"
data["ctl00$MainContent$txtOt"] = "a"
data["ctl00$MainContent$txtTel"] = "1122334455"
data["ctl00$MainContent$txtEmail"] = "akssbsars2@gmail.com"
data["ctl00$MainContent$DDL_Day"] = 1
data["ctl00$MainContent$DDL_Month"] = 1
data["ctl00$MainContent$TextBox_Year"] = 1993
data["ctl00$MainContent$DDL_Mr"] = "MR"
# `captcha_txt` (the solved captcha text) is defined outside this snippet.
data["ctl00$MainContent$txtCode"] = captcha_txt
data["ctl00$MainContent$ButtonA"] = "Next"
import json
# NOTE: passing a JSON string as `data` sends it as the raw request body
# instead of form-encoded fields; this is the call the question is about.
submit_captcha_resp = s.post("http://washington.kdmid.ru/queue-en/visitor.aspx",
                             data=json.dumps(data))
final_page = BeautifulSoup(submit_captcha_resp.text, "html.parser")
It won't redirect, because it's not a browser. BeautifulSoup doesn't run JS scripts or HTML code. But you do get the response.
You should use one of these:
submit_captcha_resp = s.post("yourLongURL", json=data)
or
submit_captcha_resp = s.post("yourLongURL", data=data)
json.dumps() is used to serialize an object to a JSON string, but you don't need that here, because the web page you are posting to uses an HTML form tag, and a form tag posts the data without converting it to a string. So you shouldn't convert it to a string. You should post it in JSON format.
And as @dharmey said: if you get a 404, you should set the user agent to that of a popular web browser. For example:
{"User-Agent":"Mozilla/5.0"}
And I think now you have bigger problems like passing the Captcha.
I think you might be posting the data in the wrong way. You could try
submit_captcha_resp = s.post("http://washington.kdmid.ru/queue-en/visitor.aspx",
json=data)
instead of data=json.dumps(data).
If this doesn't work, or the site requires actual form data, try passing in some headers, as they might be required for the server to receive the request correctly.
You could just include
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}
submit_captcha_resp = s.post("http://washington.kdmid.ru/queue-en/visitor.aspx",
headers=headers, data=json.dumps(data))
to start out.
When I add the command-line option "-simulation", as per the Gatling web site, to the "gatling.bat" file that comes as part of the "gatling-charts-highcharts-bundle-3.3.1" download,
I get an error saying "Warning: Unknown option -simulation".
I don't understand why?
I'm using Visual Studio Code and running "gatling.bat" from the PowerShell terminal window.
If I don't pass any command-line options, it runs as expected.
Here is some of my code and screenshots of the Gatling website.
Gatling Website Screenshots
Note it says that Gatling can be started
My Simulation Code
Note the classname is "AllTests"
package api
//Import API Object's to include in test
import api1.{foo => foofoo}
import api2.{bar => barbar}
//Gatling and Scala imports
import io.gatling.core.Predef._
import io.gatling.http.Predef._
import scala.concurrent.duration._
import java.util.concurrent.ThreadLocalRandom
import com.typesafe.config._
/**
 * Gatling simulation that runs the `users` member of the imported `foofoo`
 * and `barbar` objects against one shared HTTP protocol configuration.
 */
class AllTests extends Simulation {
/*
Disabled: would read the base URL from application.conf instead of the
hard-coded value used below.
val conf = ConfigFactory.load("application.conf");
val baseUrl = conf.getString("base-app.baseurl")
*/
// Protocol configuration shared by both scenarios: the base URL plus the
// browser-like default headers set on the protocol.
val httpProtocol = http
.baseUrl("https://api.accp.qqq/")
.header("Sec-Fetch-Site","same-origin")
.header("Sec-Fetch-Mode","cors")
.header("Sec-Fetch-Dest","empty")
.acceptHeader("application/json")
.acceptLanguageHeader("en-US,en;q=0.9")
.acceptEncodingHeader("gzip, deflate, br")
.userAgentHeader("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36 Edg/86.0.622.38")
// Inject one user at once into each scenario (presumably `users` is a
// scenario builder -- defined in api1/api2, not visible here) and attach the
// shared protocol.
setUp(
foofoo.users.inject(atOnceUsers(1)),
barbar.users.inject(atOnceUsers(1))
).protocols(httpProtocol)
}
Here is the "gatling.bat" with the command-line option
Note the warning
C:\Users\xxx\source\perfrepo\Test01\bin> .\gatling.bat -simulation AllTests
GATLING_HOME is set to "C:\Users\xxx\source\perfrepo\Test01"
JAVA = ""C:\Program Files\Java\jdk-11.0.2\bin\java.exe""
Warning: Unknown option -simulation
Warning: Unknown argument 'AllTests'
Choose a simulation number:
[0] computerdatabase.advanced.AdvancedSimulationStep01
[1] computerdatabase.advanced.AdvancedSimulationStep02
[2] computerdatabase.advanced.AdvancedSimulationStep03
[3] computerdatabase.advanced.AdvancedSimulationStep04
[4] computerdatabase.advanced.AdvancedSimulationStep05
[5] api.AllTests
That's an error in our documentation generation, where -- (double dashes) were transformed into single ones (see doc sources). Use `--simulation AllTests` (or the short form `-s AllTests`) instead.
I’m having a problem with the HttpClient library in java.
The target web site is on SSL (https://www.betcris.com), and I can load the index page from that site just fine .
However, the different pages showing odds for the different sports returns a 403 response code with HttpClient, but loading the same pages in a browser works just fine.
Here is such a page : https://www.betcris.com/en/live-lines/soccer.
I started troubleshooting this page with the information gathered by HttpFox (a Firefox add-on that resembles LiveHttpHeaders), making sure I had all the correct request headers and cookies, but I couldn’t get it to load using HttpClient. I also determined that cookies have nothing to do with the problem, as I can remove all cookies for that web site within my browser, and then hit the page directly and it will load.
I confirmed that there’s something special going on with these pages by using the online tool at http://www.therightapi.com/test. This tool allows you to input the url of a page along with any Request header you want, and shows you the response you get from the target web site. Using that tool, I can load https://www.google.com just fine, but I get the same 403 error when trying to load https://www.betcris.com/en/live-lines/soccer.
Here's my setup at therightapi :
And the response :
Does anyone know what’s going on here ?
Thanks.
EDIT : I've created a test project, here's the java code, followed by the maven dependency you should have in your pom :
package com.yourpackage;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
/**
 * Minimal reproduction: issues a single GET request with a browser-like
 * User-Agent, prints the HTTP status code and reads the response body.
 */
public class TestHttpClient {
    public static void main(String[] args) {
        String url = "https://www.betcris.com/en/live-lines/soccer";
        HttpClient client = HttpClientBuilder.create().build();
        HttpGet request = new HttpGet(url);
        // add request header
        request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0");
        try {
            HttpResponse response = client.execute(request);
            System.out.println("Response Code : "
                    + response.getStatusLine().getStatusCode());
            // Some responses carry no body; only read when an entity exists.
            if (response.getEntity() != null) {
                // try-with-resources closes the reader and the underlying
                // content stream, which releases the connection even if
                // reading fails part-way through.
                try (BufferedReader rd = new BufferedReader(
                        new InputStreamReader(response.getEntity().getContent()))) {
                    // Body text (line breaks dropped), kept for inspection.
                    StringBuilder result = new StringBuilder();
                    String line;
                    while ((line = rd.readLine()) != null) {
                        result.append(line);
                    }
                }
            }
        } catch (ClientProtocolException e) {
            // HTTP protocol violation reported by the client library.
            e.printStackTrace();
        } catch (IOException e) {
            // Connection or stream failure.
            e.printStackTrace();
        }
    }
}
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
I solved this problem (avoiding the 403) by setting the User-Agent property when making the request, as follows:
If you use HttpClient
HttpGet httpGet = new HttpGet(URL_HERE);
// Send a browser-like User-Agent. The value must be a single string literal:
// a Java string literal cannot continue across a line break.
httpGet.setHeader("User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
If you use HttpURLConnection
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
// Send a browser-like User-Agent. The value must be a single string literal:
// a Java string literal cannot continue across a line break.
connection.setRequestProperty("User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
I use the following code to consume HTTPS Urls:
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContextBuilder;
...
// SECURITY: this trust strategy accepts every certificate and, combined with
// NoopHostnameVerifier below, disables hostname verification as well. That
// removes TLS protection against man-in-the-middle attacks; use it only
// against test endpoints, never for production traffic.
SSLContext sslContext =
        new SSLContextBuilder().loadTrustMaterial(null, (certificate, authType) -> true).build();
try (CloseableHttpClient httpClient = HttpClients.custom().setSSLContext(sslContext)
        .setSSLHostnameVerifier(new NoopHostnameVerifier()).build()) {
    HttpGet httpGet = new HttpGet("YOUR_HTTPS_URL");
    httpGet.setHeader("Accept", "application/xml");
    httpGet.setHeader("User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
    // Close the response as well as the client so the connection is released
    // as soon as the response has been handled.
    try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
        logger.info("Response: " + response);
    }
}
pom.xml:
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
In my case, the web server does not use a proxy to communicate with APIs.
I just disabled the defaultProxy element under system.net in web.config.
<system.net>
<defaultProxy enabled="false" />
</system.net>
403 Forbidden is used to signal an authentication requirement. In fact, the full 403 response should tell you exactly that. Luckily, HttpClient can do authentication.
I have a scala action like this:
def myAction(myParam: String) = Action { implicit request =>
}
How can I check if a request is from mobile browser? I searched and found nothing online.
To do this you can inspect "User-Agent" header in the request.
You can get regexp that matches mobile user agents from JSP snippet located here (JSP is based on Java and you can use Java's regexp-related classes and patterns in Scala, so you won't have to do much to use it in your code).
To get user agent you can use something like this:
request.headers().get("User-Agent")
Example of user agent check:
// Sample user agent of an Android phone browser.
val ua = "Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"
// Case-insensitive pattern tested against the whole user-agent string.
val mobileAnywhere = "(?i).*((android|bb\\d+|meego).+mobile|avantgo|bada\\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\\.(browser|link)|vodafone|wap|windows ce|xda|xiino).*"
// Case-insensitive pattern tested against the first four characters only.
// It must be one unbroken literal: a normal string literal cannot span lines.
val mobilePrefix = "(?i)1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\\-(n|u)|c55\\/|capi|ccwa|cdm\\-|cell|chtm|cldc|cmd\\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\\-s|devi|dica|dmob|do(c|p)o|ds(12|\\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\\-|_)|g1 u|g560|gene|gf\\-5|g\\-mo|go(\\.w|od)|gr(ad|un)|haie|hcit|hd\\-(m|p|t)|hei\\-|hi(pt|ta)|hp( i|ip)|hs\\-c|ht(c(\\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\\-(20|go|ma)|i230|iac( |\\-|\\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\\/)|klon|kpt |kwc\\-|kyo(c|k)|le(no|xi)|lg( g|\\/(k|l|u)|50|54|\\-[a-w])|libw|lynx|m1\\-w|m3ga|m50\\/|ma(te|ui|xo)|mc(01|21|ca)|m\\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\\-2|po(ck|rt|se)|prox|psio|pt\\-g|qa\\-a|qc(07|12|21|32|60|\\-[2-7]|i\\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\\-|oo|p\\-)|sdk\\/|se(c(\\-|0|1)|47|mc|nd|ri)|sgh\\-|shar|sie(\\-|m)|sk\\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\\-|v\\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\\-|tdg\\-|tel(i|m)|tim\\-|t\\-mo|to(pl|sh)|ts(70|m\\-|m3|m5)|tx\\-9|up(\\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\\-|your|zeto|zte\\-"
// take(4) returns the same prefix as substring(0,4) for normal user agents but
// does not throw when the string is shorter than four characters.
if (ua.matches(mobileAnywhere) || ua.take(4).matches(mobilePrefix)) {
  println("mobile")
} else {
  println("not mobile")
}