U
    @6^J=                     @   s8   d dl Z d dlZd dlmZmZmZ G dd deZdS )    N)
SpiderFootSpiderFootPluginSpiderFootEventc                '   @   s   e Zd ZdZdddddddgd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)g!d*gdddd+Zd,d-d.d/d0d1d2d3d4d5d6d7Ze Zd8Zd8Z	d8Z
e fd9d:Zd;d< Zd=d> ZdMd?d@ZdNdAdBZdCdD ZdEdF ZdGdH ZdIdJ ZdKdL Zd8S )O
sfp_spiderzpSpider:Footprint,Investigate:Crawling and Scanning:slow:Spidering of web-pages to extract content for searching.Fr   d      Tzhttp://zhttps://pnggifjpgjpegtifftiftarpdficoflvmp4mp3avimpggzmpegisodatmovswfZrarexezipbinbz2xsldocdocxZpptpptxZxlsxlsxcsvzimage/)
robotsonlypausesecmaxpages	maxlevels
usecookiesstartfilterfiles
filtermimefilterusersnosubsreportduplicatesz*Only follow links specified by robots.txt?zAccept and use cookies?z0Number of seconds to pause between page fetches.zCPrepend targets with these until you get a hit, to start spidering.z?Maximum number of pages to fetch per starting point identified.znMaximum levels to traverse per starting point (e.g. hostname or link identified by another module) identified.z-File extensions to ignore (don't fetch them.)zMIME types to ignore.z%Skip spidering of /~user directories?z+Skip spidering of subdomains of the target?z;Report links every time one is found, even if found before?)r&   r*   r'   r+   r(   r)   r,   r-   r.   r/   r0   Nc                 C   sN   || _ |  | _|  | _|  | _d| _t| D ]}|| | j|< q6d S )NzTarget Website)	sftempStoragefetchedPages	urlEventssiteCookies__dataSource__listkeysopts)selfsfcuserOptsopt r>   5/var/www/spiderfoot.crq.systems/modules/sfp_spider.pysetupC   s    


zsfp_spider.setupc                    s  | j  }d } fdd}tt|| jd r4d S || jkrj| j d| d t| j|   | j| }| j j d|| jd | jd d	d
}d| j	 < | jd r|d d k	r|d 
dr|d 
d| j|< | j d| d t| j|    | jkr| j dd d | j < |  || j   |d d k	r||d  kr|d| j	|d < | |d | j  | j|d < |d  | j  |d |   }|d kst|dkr| j d   d S |D ]:}| jd s|| jkrq| || j  | j|< q| j dt|  |S )Nc                    s       dd d|    S )N?r   .)lowersplitendswith)exturlr>   r?   <lambda>S       z'sfp_spider.processUrl.<locals>.<lambda>r,   zRestoring cookies for z: F_fetchtimeout
_useragenti )	sizeLimitTr*   headersz
Set-CookiezSaving cookies for z0Something strange happened - shouldn't get here.realurlcontentr   zNo links found at r0   zLinks found from parsing: )r1   urlFQDNr7   filterr9   r5   debugstrfetchUrlr3   getr4   errorcontentNotify
linkNotify
parseLinks	getTargetgetNamesleninfo)r:   rH   sitecookiesZ	checkExtsfetchedlinkslinkr>   rG   r?   
processUrlN   sR    
"
 
"


zsfp_spider.processUrlc                    s   t  }|D ]ĉ | j }| j }|  |s6q
| jd rT|  j|ddsTq
|  j|ddshq
| jd r|d kr|q
|| jkr| jd r fdd	}tt	|| j| rq
| j
d
   |  | < q
|S )Nr/   F)includeChildren)includeParentsr.   z/~r&   c                    s   t | |   kp| dkS )N*)typerC   )blockedrc   r>   r?   rI      rJ   z'sfp_spider.cleanLinks.<locals>.<lambda>zAdding URL for spidering: )dictr1   
urlBaseUrlrQ   r[   matchesr9   robotsRulesr7   rR   rS   )r:   rb   returnLinksZlinkBaseZlinkFQDNZcheckRobotsr>   rj   r?   
cleanLinks   s*    
zsfp_spider.cleanLinksc                 C   sX   |   | j|rd}nd}t|tkr:t|ddd}t||| j|}| | |S )NLINKED_URL_INTERNALLINKED_URL_EXTERNALzutf-8replace)errors)	r[   rm   r1   rQ   rh   rT   r   __name__notifyListeners)r:   rH   parentEventZutypeeventr>   r>   r?   rY      s    
zsfp_spider.linkNotifyc           	      C   s   d}| drD|d  d}|s&d}n| jd D ]}||r0d}q0|rx|d d krxtd|d | j|}||_| | |d }|d krtdtj|dd	| j|}||_| | td
t	|d | j|}||_| | | dsd S |d  d}|rtd|| j|}||_| | d S )NTrN   zcontent-typer-   FrP   TARGET_WEB_CONTENTWEBSERVER_HTTPHEADERS)ensure_ascii	HTTP_CODEcodeTARGET_WEB_CONTENT_TYPE)
rV   r9   
startswithr   ru   actualSourcerv   jsondumpsrT   )	r:   rH   Z
httpresultrw   Zsendcontentctypemtrx   hdrr>   r>   r?   rX      sP    


 
 
 

 zsfp_spider.contentNotifyc                 C   s   ddgS )Nrq   INTERNET_NAMEr>   r:   r>   r>   r?   watchedEvents   s    zsfp_spider.watchedEventsc                 C   s   dgS )Nnoselfr>   r   r>   r>   r?   	watchOpts   s    zsfp_spider.watchOptsc                 C   s   ddddddgS )Nrz   r|   rq   rr   ry   r~   r>   r   r>   r>   r?   producedEvents   s
      zsfp_spider.producedEventsc           	      C   s  |j }|j}|j}d }| jd| d |  || jkrP| jd| d  d S || j|< |dkrnd|krnd S |dkr| jd D ]X}| jj|| | jd	 | jd
 d}|d d k	r|| }td|| j	|}| 
|  qqn|}|d krd S | jd| d |  || j|< | |S )NzReceived event, z, from z	Ignoring z* as already spidered or is being spidered.rq   r   r   r+   rK   rL   timeout	useragentrP   zInitiating spider of z from )	eventTypemoduledatar1   rS   r4   r9   rU   r   ru   rv   r^   
spiderFrom)	r:   rx   	eventNamesrcModuleName	eventDataZspiderTargetprefixresevtr>   r>   r?   handleEvent   s<    

 

zsfp_spider.handleEventc                 C   s0  d}d}d}t  }| j|}| jd r|| jkr| jj|d | jd | jd d}|d d k	r| jd	|d   | j|d | j|< |  rd S | 	|}|d kr| jd
 d S |r,t
|dkrt  }|D ]}	|	| jkr| jd|	 d  q|  r
 d S | jd|	  t| jd  | 	|	}
|
d k	rH||
 |d7 }|| jd kr| jdt| jd  d  d} qq| |}| jdt|  |d7 }| jdt| d t|  || jd kr | jdt| jd  d  d}t
|dkr| jd d}|  rd}qd S )NTr   r&   z/robots.txtrK   rL   r   rP   zrobots.txt contents: z"No links found on the first fetch!zAlready fetched z, skipping.zFetching fresh content from: r'      r(   zMaximum number of pages (z
) reached.FzFound links: z
At level: z	, Pages: r)   zMaximum number of levels (z*No more links found to spider, finishing..)rk   r1   rl   r9   rn   rU   rS   parseRobotsTxtcheckForStoprd   r]   r3   timesleepupdater^   rT   rp   )r:   ZstartingPointZkeepSpideringZtotalFetchedZlevelsTraversedZ	nextLinksZ
targetBaseZ	robotsTxtrb   rc   Z
freshLinksr>   r>   r?   r   (  sn     






 zsfp_spider.spiderFrom)N)N)ru   
__module____qualname____doc__r9   optdescsrk   rn   r3   r4   r5   r@   rd   rp   rY   rX   r   r   r   r   r   r>   r>   r>   r?   r      s                         ?*

+*r   )r   r   sflibr   r   r   r   r>   r>   r>   r?   <module>   s   