3
@6^J=                 @   s8   d dl Z d dlZd dlmZmZmZ G dd deZdS )    N)
SpiderFootSpiderFootPluginSpiderFootEventc            '   @   s   e Zd ZdZdddddddgd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)g!d*gdddd+Zd,d-d.d/d0d1d2d3d4d5d6d7Ze Zd8Zd8Z	d8Z
e fd9d:Zd;d< Zd=d> ZdMd?d@ZdNdAdBZdCdD ZdEdF ZdGdH ZdIdJ ZdKdL Zd8S )O
sfp_spiderzpSpider:Footprint,Investigate:Crawling and Scanning:slow:Spidering of web-pages to extract content for searching.Fr   d      Tzhttp://zhttps://pnggifjpgjpegtifftiftarpdficoflvmp4mp3avimpggzmpegisodatmovswfZrarexezipbinbz2xsldocdocxZpptpptxZxlsxlsxcsvzimage/)
robotsonlypausesecmaxpages	maxlevels
usecookiesstartfilterfiles
filtermimefilterusersnosubsreportduplicatesz*Only follow links specified by robots.txt?zAccept and use cookies?z0Number of seconds to pause between page fetches.zCPrepend targets with these until you get a hit, to start spidering.z?Maximum number of pages to fetch per starting point identified.znMaximum levels to traverse per starting point (e.g. hostname or link identified by another module) identified.z-File extensions to ignore (don't fetch them.)zMIME types to ignore.z%Skip spidering of /~user directories?z+Skip spidering of subdomains of the target?z;Report links every time one is found, even if found before?)r&   r*   r'   r+   r(   r)   r,   r-   r.   r/   r0   Nc             C   sR   || _ | j | _| j | _| j | _d| _x"t|j D ]}|| | j|< q8W d S )NzTarget Website)	sftempStoragefetchedPages	urlEventssiteCookies__dataSource__listkeysopts)selfsfcuserOptsopt r>   5/var/www/spiderfoot.crq.systems/modules/sfp_spider.pysetupC   s    


zsfp_spider.setupc                s   | j j }d } fdd}tt|| jd r4d S || jkrj| j jd| d t| j|   | j| }| j j d|| jd | jd d	d
}d| j	 < | jd r|d d k	r|d j
dr|d j
d| j|< | j jd| d t| j|    | jkr| j jdd d | j < | j || j   |d d k	r||d  kr|d| j	|d < | j|d | j  | j|d < |d  | j j |d | j j }|d kst|dkr| j jd   d S xB|D ]:}| jd s|| jkrq| j|| j  | j|< qW | j jdt|  |S )Nc                s     j  jdd jd| j   S )N?r   .)lowersplitendswith)ext)urlr>   r?   <lambda>S   s    z'sfp_spider.processUrl.<locals>.<lambda>r,   zRestoring cookies for z: F_fetchtimeout
_useragenti )	sizeLimitTr*   headersz
Set-CookiezSaving cookies for z0Something strange happened - shouldn't get here.realurlcontentr   zNo links found at r0   zLinks found from parsing: )r1   urlFQDNr7   filterr9   r5   debugstrfetchUrlr3   getr4   errorcontentNotify
linkNotify
parseLinks	getTargetgetNamesleninfo)r:   rG   sitecookiesZ	checkExtsfetchedlinkslinkr>   )rG   r?   
processUrlN   sJ    
"

"


zsfp_spider.processUrlc                s   t  }x|D ]Ɖ | jj }| jj }| j j|s8q| jd rX| j j|dd rXq| j j|ddslq| jd rd krq|| jkr| jd r fdd	}tt	|| j| rq| jj
d
   |  | < qW |S )Nr/   F)includeChildren)includeParentsr.   z/~r&   c                s   t | j|  j kp| dkS )N*)typerC   )blocked)ra   r>   r?   rH      s    z'sfp_spider.cleanLinks.<locals>.<lambda>zAdding URL for spidering: )dictr1   
urlBaseUrlrO   rY   matchesr9   robotsRulesr7   rP   rQ   )r:   r`   returnLinksZlinkBaseZlinkFQDNZcheckRobotsr>   )ra   r?   
cleanLinks   s(    

zsfp_spider.cleanLinksc             C   sX   | j  j| jj|rd}nd}t|tkr:t|ddd}t||| j|}| j| |S )NLINKED_URL_INTERNALLINKED_URL_EXTERNALzutf-8replace)errors)	rY   rj   r1   rO   rf   rR   r   __name__notifyListeners)r:   rG   parentEventZutypeeventr>   r>   r?   rW      s    
zsfp_spider.linkNotifyc       	      C   s$  d}|j drH|d j d}|s&d}n"x | jd D ]}|j|r2d}q2W |r||d d kr|td|d | j|}||_| j| |d }|d krtdtj|dd	| j|}||_| j| td
t	|d | j|}||_| j| |j dsd S |d j d}|r td|| j|}||_| j| d S )NTrL   zcontent-typer-   FrN   TARGET_WEB_CONTENTWEBSERVER_HTTPHEADERS)ensure_ascii	HTTP_CODEcodeTARGET_WEB_CONTENT_TYPE)
rT   r9   
startswithr   rr   actualSourcers   jsondumpsrR   )	r:   rG   Z
httpresultrt   Zsendcontentctypemtru   hdrr>   r>   r?   rV      s@    










zsfp_spider.contentNotifyc             C   s   ddgS )Nrn   INTERNET_NAMEr>   )r:   r>   r>   r?   watchedEvents   s    zsfp_spider.watchedEventsc             C   s   dgS )Nnoselfr>   )r:   r>   r>   r?   	watchOpts   s    zsfp_spider.watchOptsc             C   s   ddddddgS )Nrw   ry   rn   ro   rv   r{   r>   )r:   r>   r>   r?   producedEvents   s    zsfp_spider.producedEventsc       	      C   s  |j }|j}|j}d }| jjd| d |  || jkrP| jjd| d  d S || j|< |dkrnd|krnd S |dkrxj| jd D ]V}| jj|| | jd	 | jd
 d}|d d k	r|| }td|| j	|}| j
| P qW n|}|d krd S | jjd| d |  || j|< | j|S )NzReceived event, z, from z	Ignoring z* as already spidered or is being spidered.rn   r   r   r+   rI   rJ   )timeout	useragentrN   zInitiating spider of z from )	eventTypemoduledatar1   rQ   r4   r9   rS   r   rr   rs   r\   
spiderFrom)	r:   ru   	eventNamesrcModuleName	eventDataZspiderTargetprefixresevtr>   r>   r?   handleEvent   s6    




zsfp_spider.handleEventc             C   s6  d}d}d}t  }| jj|}| jd r|| jkr| jj|d | jd | jd d}|d d k	r| jjd	|d   | jj|d | j|< | j rd S | j	|}|d kr| jjd
 d S xv|r0t
|dkrt  }x|D ]}	|	| jkr| jjd|	 d  q| j rd S | jjd|	  tj| jd  | j	|	}
|
d k	rN|j|
 |d7 }|| jd kr| jjdt| jd  d  d}P qW | j|}| jjdt|  |d7 }| jjdt| d t|  || jd kr| jjdt| jd  d  d}t
|dkr"| jjd d}| j rd}qW d S )NTr   r&   z/robots.txtrI   rJ   )r   r   rN   zrobots.txt contents: z"No links found on the first fetch!zAlready fetched z, skipping.zFetching fresh content from: r'      r(   zMaximum number of pages (z
) reached.FzFound links: z
At level: z	, Pages: r)   zMaximum number of levels (z*No more links found to spider, finishing..)rh   r1   ri   r9   rk   rS   rQ   parseRobotsTxtcheckForStoprb   r[   r3   timesleepupdater\   rR   rm   )r:   ZstartingPointZkeepSpideringZtotalFetchedZlevelsTraversedZ	nextLinksZ
targetBaseZ	robotsTxtr`   ra   Z
freshLinksr>   r>   r?   r   (  sf    







 zsfp_spider.spiderFrom)N)N)rr   
__module____qualname____doc__r9   optdescsrh   rk   r3   r4   r5   r@   rb   rm   rW   rV   r   r   r   r   r   r>   r>   r>   r?   r      sR   ?*

+*r   )r   r~   sflibr   r   r   r   r>   r>   r>   r?   <module>   s   