您现在的位置是:首页 > PHP框架交流

thinkphp使用querylist采集笑话网站笔记

上善若水2020-07-15 00:34:19【PHP框架交流】 3329人已围观

简介:thinkphp使用querylist采集笑话网站笔记,使用thinkphp commands命令写法采集笑话数据,不多说直接上代码。1.创建数据采集记录表 CREATE TABLE `joke_list` ……

thinkphp使用querylist采集笑话网站笔记,使用thinkphp commands命令写法采集笑话数据,不多说直接上代码。

1.创建数据采集记录表

-- Crawl queue: one row per joke entry found on a list page.
-- `category`, `title` and `link` are the fields extracted by the list-page
-- scraping rules of the collector command.
-- `status` defaults to 1; the detail-collection step selects rows with
-- status = 1 and sets the row to 2 once its detail record has been saved.
-- NOTE(review): `create_time` / `update_time` are presumably filled by the
-- model's automatic timestamp feature — confirm against the framework docs.
CREATE TABLE `joke_list` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `category` varchar(255) DEFAULT NULL,
  `title` varchar(255) DEFAULT NULL,
  `link` varchar(255) DEFAULT NULL,
  `status` tinyint(4) DEFAULT '1',
  `create_time` datetime DEFAULT NULL,
  `update_time` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
-- Scraped joke detail pages, looked up by `link` when the collector saves a
-- record.
-- `category` is copied from the list row; `title`, `tag`, `description` and
-- `content` come from the detail page (`description` from the page's
-- meta description, `content` stored after htmlentities() encoding).
-- The collector writes `status` = 2 for every detail row it saves.
-- NOTE(review): `create_time` / `update_time` are presumably filled by the
-- model's automatic timestamp feature — confirm against the framework docs.
CREATE TABLE `joke_detail` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `category` varchar(255) DEFAULT NULL,
  `title` varchar(255) DEFAULT NULL,
  `tag` varchar(255) DEFAULT NULL,
  `description` varchar(2000) DEFAULT NULL,
  `content` text,
  `link` varchar(255) DEFAULT NULL,
  `status` tinyint(4) DEFAULT '1',
  `create_time` datetime DEFAULT NULL,
  `update_time` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

2.编写数据库模型

JokeDetail.php

<?php
declare (strict_types = 1);

namespace app\model;

use think\Model;

/**
 * Model for scraped joke detail records.
 *
 * NOTE(review): presumably mapped to the `joke_detail` table through the
 * framework's class-name-to-table convention — confirm against the database
 * prefix configuration.
 *
 * @mixin \think\Model
 */
class JokeDetail extends Model
{
    // Turns on the framework's automatic timestamp writing with the
    // 'datetime' type. NOTE(review): presumably fills the create_time and
    // update_time columns — confirm against the ThinkPHP model docs.
    protected $autoWriteTimestamp = 'datetime';
}

JokeList.php

<?php
declare (strict_types = 1);

namespace app\model;

use think\Model;

/**
 * Model for joke list entries (the queue of links still to be scraped).
 *
 * NOTE(review): presumably mapped to the `joke_list` table through the
 * framework's class-name-to-table convention — confirm against the database
 * prefix configuration.
 *
 * @mixin \think\Model
 */
class JokeList extends Model
{
    // Turns on the framework's automatic timestamp writing with the
    // 'datetime' type. NOTE(review): presumably fills the create_time and
    // update_time columns — confirm against the ThinkPHP model docs.
    protected $autoWriteTimestamp = 'datetime';
}

3.安装querylist框架

执行composer安装命令:composer require jaeger/querylist
采集脚本中用到了 AbsoluteUrl 插件(QL\Ext\AbsoluteUrl),需要另外安装:composer require jaeger/querylist-absolute-url
参考安装文档

4.创建commands编写采集脚本代码(可先执行 php think make:command Joke joke 生成 app/command/Joke.php,再替换为下面的代码)

<?php
declare (strict_types=1);

namespace app\command;

use app\model\JokeDetail;
use app\model\JokeList;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Exception\TransferException;
use QL\Ext\AbsoluteUrl;
use QL\QueryList;
use think\console\Command;
use think\console\Input;
use think\console\Output;
use think\facade\Log;

/**
 * Console command that scrapes a joke site in two passes:
 *
 *  1. task()  walks the paginated list pages and stores every entry
 *             (category, title, link) through the JokeList model.
 *  2. task2() visits each stored link whose status is 1, stores the detail
 *             page through the JokeDetail model and marks the list row with
 *             status 2.
 *
 * Every HTTP request is attempted at most MAX_ATTEMPTS times. A URL that keeps
 * failing is logged and skipped, so one unreachable page can no longer stall
 * the whole run; skipped detail links keep status 1 and are picked up again
 * on the next run.
 */
class Joke extends Command
{
    /** Base URL of the site; also used to turn relative links into absolute ones. */
    private const BASE_URL = 'https://xiaohua.bbwx.com/';

    /** Number of the last list page to crawl. */
    private const LAST_PAGE = 146;

    /** Per-request timeout (seconds) handed to the HTTP client. */
    private const TIMEOUT = 30;

    /** Maximum number of attempts for a single URL before it is skipped. */
    private const MAX_ATTEMPTS = 3;

    /**
     * Declares the command name and description.
     */
    protected function configure()
    {
        // Command configuration.
        $this->setName('caiji')
            ->setDescription('the caiji command');
    }

    /**
     * Runs the list pass followed by the detail pass.
     *
     * @param Input  $input  Console input (unused).
     * @param Output $output Console output (unused).
     */
    protected function execute(Input $input, Output $output)
    {
        $this->task();
        $this->task2();
    }

    /**
     * Crawls list pages 1..LAST_PAGE and saves the entries found on each page.
     *
     * A page that cannot be fetched after MAX_ATTEMPTS attempts is logged and
     * skipped instead of being retried forever.
     */
    protected function task()
    {
        $ql = QueryList::getInstance();
        $ql->use(AbsoluteUrl::class);

        $rules = [
            'category' => ['header>a', 'text'],
            'link'     => ['h2 a', 'href'],
            'title'    => ['h2 a', 'text'],
        ];
        // Slice selector: each match is one list entry the rules are applied to.
        $range = '.content .excerpt-text';

        for ($page = 1; $page <= self::LAST_PAGE; $page++) {
            $url = self::BASE_URL . "page/{$page}";
            Log::info("开始采集第{$page}页数据!");

            $document = $this->fetchPage($ql, $url);
            if ($document === null) {
                Log::error("第{$page}页数据采集失败,已跳过!");
                continue;
            }

            $rows = $document->absoluteUrl(self::BASE_URL)
                ->rules($rules)
                ->range($range)
                ->query()
                ->getData()
                ->all();

            if ($rows) {
                (new JokeList())->saveAll($rows);
            }
        }
    }

    /**
     * Scrapes the detail page of every list row with status 1.
     *
     * On success the detail record is created or updated (looked up by link)
     * and the list row is set to status 2. Rows without a link, or whose page
     * cannot be fetched, are skipped and keep status 1.
     */
    protected function task2()
    {
        $ql = QueryList::getInstance();
        $jokeList = JokeList::where('status', 1)->select();

        foreach ($jokeList as $joke) {
            // The link column is nullable; never hand an empty URL to the HTTP client.
            $link = (string) $joke->link;
            if ($link === '') {
                continue;
            }

            $document = $this->fetchPage($ql, $link);
            if ($document === null) {
                Log::error("详情页采集失败{$link},已跳过!");
                continue;
            }

            // Strip the ".asb" blocks out of the article before reading its HTML.
            $document->find('.content .article-content .asb')->remove();

            $detail = [
                'category'    => $joke->category,
                'title'       => $document->find('.article-title')->text(),
                'tag'         => $document->find('.article-tags>a')->text(),
                'description' => $document->find("meta[name='description']")->attr('content'),
                'content'     => htmlentities($document->find('.content .article-content')->html()),
                'link'        => $link,
                'status'      => 2,
            ];

            // Reuse the existing detail row for this link if there is one.
            $jokeDetail = JokeDetail::where('link', $link)->findOrEmpty();
            $jokeDetail->save($detail);

            $joke->status = 2;
            $joke->save();
        }
    }

    /**
     * Loads a URL into the QueryList instance, retrying on transfer errors.
     *
     * TransferException is caught rather than RequestException because it is
     * the common parent of request and connection errors in Guzzle.
     *
     * @param QueryList $ql  Instance used to perform the request.
     * @param string    $url Absolute URL to fetch.
     *
     * @return QueryList|null The loaded document, or null when every attempt failed.
     */
    private function fetchPage(QueryList $ql, string $url): ?QueryList
    {
        for ($attempt = 1; $attempt <= self::MAX_ATTEMPTS; $attempt++) {
            try {
                return $ql->get($url, null, ['timeout' => self::TIMEOUT]);
            } catch (TransferException $e) {
                Log::info("请求失败{$url}(第{$attempt}次): {$e->getMessage()}");
            }
        }

        return null;
    }
}

5.注册该脚本命令config/console.php

<?php
// +----------------------------------------------------------------------
// | Console configuration
// +----------------------------------------------------------------------
return [
    // Command definitions.
    // NOTE(review): the key is presumably the name used on the command line
    // (the article runs `think joke`), while the class calls setName('caiji')
    // — confirm which name the console actually exposes.
    'commands' => [
        'joke' => 'app\command\Joke',
    ],
];

6.在项目根目录执行命令,等待采集完成:
php think joke

Tags: querylist

很赞哦! (6)

相关文章

随机图文

文章评论

站点信息

  • 建站时间:2019-10-24
  • 网站程序:Thinkphp6 Layui
  • 文章统计247篇文章
  • 标签管理标签云
  • 统计数据cnzz统计
  • 微信公众号:扫描二维码,关注我们