|
最近有个项目要采集淘宝哇哦视频,没有相应的API,只能靠抓包获取h5api接口了。
切入正题,开工:
先获取页面加载h5api时的cookies:
这个是页面地址:https://market.m.taobao.com/app/ ... uijian.no_banner_27
通过抓包可以看到加载的h5api是
https://h5api.m.taobao.com/h5/mtop.mediainteraction.video.detail/1.0/?jsv=2.4.5&appKey=12574478&t=1558066703249&sign=2f83acb0b1ec3eacf555f158bd2ad048&api=mtop.mediainteraction.video.detail&v=1.0&type=jsonp&dataType=jsonp&timeout=20000&preventFallback=true&callback=mtopjsonp1&data=%7B%22type%22%3A%22guang%22%2C%22id%22%3A%22307459476%22%2C%22extParams%22%3A%22%7B%5C%22spm-cnt%5C%22%3A%5C%22a310p.11570659%5C%22%2C%5C%22spm-url%5C%22%3A%5C%22a310p.11215598.tuijian.no_banner_27%5C%22%2C%5C%22page%5C%22%3A%5C%22guang%5C%22%2C%5C%22product_type%5C%22%3A%5C%22videointeract%5C%22%2C%5C%22echoParam%5C%22%3A%7B%7D%7D%22%7D
我们获取cookies的目的就是生成sign这个参数;
如何获取cookie呢?
cookie地址:https://h5api.m.taobao.com/h5/mtop.mediainteraction.video.detail/1.0/?appKey=12574478
只需要访问这个页面,从这个页面获取cookie就行了。
PHP可以通过curl模拟访问。
有一点要注意的是:
h5api是需要验证sign和cookie的,如果有一项不正确,则请求就会失败,然后服务端会重新生成sign进行验证,但是cookie不会变。
只要知道这一点就好办了。
先通过php的curl请求cookie地址,要连续请求两次,第二次才可以获取到带cookie的header;从中取出_m_h5_tk和_m_h5_tk_enc这两个值就行了。
cookie只需要包含这两项。生成sign时,取_m_h5_tk中第一个下划线之前的部分(即去掉后面的下划线和时间戳),然后用"&"依次拼接系统当前时间的13位时间戳、appkey和请求的data数据,对拼接后的字符串取MD5值就是sign;
通过这个sign请求h5api数据接口就可以获得想要的东西了。
下面附上demo:
获取cookie:
/**
 * Fetch the h5api token cookies needed to sign later requests.
 *
 * Sends an unsigned GET to the h5api endpoint and scans the raw response
 * (headers included) for the `_m_h5_tk` and `_m_h5_tk_enc` values. Up to three
 * attempts are made; the first attempt that yields both values wins.
 *
 * @return string Cookie header value "_m_h5_tk_enc=...; _m_h5_tk=...", or an
 *                empty string when no attempt produced both tokens.
 */
function getcookie(){
    $tmp_url = "https://market.m.taobao.com/app/tb-windmill-app/ishopping/index"; // spoofed referer
    $Browser = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'; // desktop browser UA
    $cookie = ""; // no cookie is sent on these bootstrap requests
    $headers = array('Content-type:application/x-www-form-urlencoded','Accept:application/json');
    // The default appKey must be present in the query string.
    $url = "https://h5api.m.taobao.com/h5/mtop.mediainteraction.video.detail/1.0/?appKey=12574478";

    for($j=0;$j<=2;$j++){
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_HEADER, 1); // include response headers: the cookies live there
        curl_setopt($ch, CURLOPT_REFERER, $tmp_url);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($ch, CURLOPT_USERAGENT, $Browser);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        // NOTE(review): TLS peer/host verification is switched off here, as in the
        // original snippet; enable it when a CA bundle is available.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
        $content = curl_exec($ch);
        curl_close($ch);

        if($content === false){
            // Transport failure: nothing to parse, try again.
            continue;
        }

        $_m_h5_tk = $this->get_word($content, '_m_h5_tk=', ';');
        $_m_h5_tk_enc = $this->get_word($content, '_m_h5_tk_enc=', ';');
        if($_m_h5_tk && $_m_h5_tk_enc){
            return "_m_h5_tk_enc=".$_m_h5_tk_enc."; _m_h5_tk=".$_m_h5_tk;
        }
    }

    // Every attempt failed to produce both tokens.
    return "";
}
复制代码
抓取数据:
/**
 * Request the video-detail payload from the h5api endpoint.
 *
 * Obtains the token cookies, derives the `sign` parameter as
 * md5(token & timestamp & appKey & data) and issues the signed GET request.
 *
 * @return string|false Raw response body, or false when the cookies could not
 *                      be obtained or the transfer failed.
 */
function getejson(){
    $cookie = $this->getcookie();
    if(!$cookie){
        // Without the token cookies no valid sign can be produced.
        return false;
    }

    $tmp_url = "https://market.m.taobao.com/app/tb-windmill-app/ishopping/index";
    $Browser = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36';
    $appKey = 12574478;

    // Only the part of _m_h5_tk before the first underscore takes part in the sign.
    $_m_h5_tk = $this->get_word($cookie, '_m_h5_tk=', '_');

    // Format explicitly so the millisecond timestamp is always a plain digit
    // string, independent of PHP's float-to-string precision setting.
    $t = sprintf('%.0f', $this->getMillisecond());

    // extParams is itself a JSON document embedded as a string, so its quotes
    // must be backslash-escaped (inside single quotes, \" stays a literal \").
    $data = '{"type":"guang","id":"301740051","extParams":"{\"spm-cnt\":\"a310p.11570659\",\"spm-url\":\"a310p.11215598.tuijian.no_banner_2\",\"page\":\"guang\",\"product_type\":\"videointeract\",\"echoParam\":{}}"}';
    $url_data = urlencode($data); // the URL carries the encoded form; the sign uses the raw form

    $headers = array('Content-type:application/x-www-form-urlencoded','Accept:application/json');
    $sign = md5($_m_h5_tk."&".$t."&".$appKey."&".$data);
    $url = "https://h5api.m.taobao.com/h5/mtop.mediainteraction.video.detail/1.0/?jsv=2.4.5&appKey=".$appKey."&t=".$t."&sign=".$sign."&api=mtop.mediainteraction.video.detail&v=1.0&timeout=20000&data=".$url_data;

    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_REFERER, $tmp_url);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($ch, CURLOPT_USERAGENT, $Browser);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_COOKIE, $cookie);
    // NOTE(review): TLS peer/host verification is switched off here, as in the
    // original snippet; enable it when a CA bundle is available.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
    $content = curl_exec($ch);
    curl_close($ch);
    return $content;
}
复制代码 用到的自定义函数:
/**
 * Current Unix time in milliseconds.
 *
 * @return float Whole number of milliseconds since the Unix epoch.
 */
function getMillisecond() {
    // microtime() without arguments yields "<fraction> <seconds>" as a string.
    $parts = explode(' ', microtime());
    $seconds = floatval($parts[0]) + floatval($parts[1]);
    // Round to a whole millisecond via %.0f, then hand back a float.
    $rounded = sprintf('%.0f', $seconds * 1000);
    return (float)$rounded;
}
/**
 * Return the text found between two markers in a string.
 *
 * The markers are inserted into a regular expression as-is (delimiter "/",
 * modifier "s"), so they must not contain an unescaped "/" and any regex
 * metacharacters in them are interpreted as such.
 *
 * @param string $html Text to search.
 * @param string $star Marker that precedes the wanted text.
 * @param string $end  Marker that follows the wanted text.
 * @return string|null Shortest text between the first $star and the next
 *                     $end, or null when there is no match.
 */
function get_word($html,$star,$end){
    $pat = '/'.$star.'(.*?)'.$end.'/s';
    // Only the first occurrence is needed, so a single preg_match suffices.
    if(preg_match($pat, $html, $mat)){
        return $mat[1];
    }
    // No match: return null explicitly instead of reading an undefined variable.
    return null;
}
复制代码 注意data和h5api链接中包含spm-cnt和spm-url这两项值也非常重要,必须保持一致。
最终输出结果:
|
|